feat(gpu): add noise and pfail tests for pbs128 and packingks

Guillermo Oyarzun
2025-12-02 13:33:39 +01:00
parent 92df46f8f2
commit 918cdcb052
11 changed files with 1971 additions and 8 deletions

View File

@@ -65,6 +65,16 @@ void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
void cleanup_cuda_integer_decompress_radix_ciphertext_128(
CudaStreamsFFI streams, int8_t **mem_ptr_void);
void cuda_integer_extract_glwe_128(
CudaStreamsFFI streams, void *glwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_list,
uint32_t const glwe_index);
void cuda_integer_extract_glwe_64(
CudaStreamsFFI streams, void *glwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_list,
uint32_t const glwe_index);
}
#endif

View File

@@ -155,3 +155,24 @@ void cleanup_cuda_integer_decompress_radix_ciphertext_128(
delete mem_ptr;
*mem_ptr_void = nullptr;
}
void cuda_integer_extract_glwe_128(
CudaStreamsFFI streams, void *glwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_list,
uint32_t const glwe_index) {
CudaStreams _streams = CudaStreams(streams);
host_extract<__uint128_t>(_streams.stream(0), _streams.gpu_index(0),
(__uint128_t *)glwe_array_out, glwe_list,
glwe_index);
}
void cuda_integer_extract_glwe_64(
CudaStreamsFFI streams, void *glwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_list,
uint32_t const glwe_index) {
CudaStreams _streams = CudaStreams(streams);
host_extract<__uint64_t>(_streams.stream(0), _streams.gpu_index(0),
(__uint64_t *)glwe_array_out, glwe_list, glwe_index);
}

View File

@@ -2349,6 +2349,22 @@ unsafe extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn cuda_integer_extract_glwe_128(
streams: CudaStreamsFFI,
glwe_array_out: *mut ffi::c_void,
glwe_list: *const CudaPackedGlweCiphertextListFFI,
glwe_index: u32,
);
}
unsafe extern "C" {
pub fn cuda_integer_extract_glwe_64(
streams: CudaStreamsFFI,
glwe_array_out: *mut ffi::c_void,
glwe_list: *const CudaPackedGlweCiphertextListFFI,
glwe_index: u32,
);
}
unsafe extern "C" {
pub fn scratch_cuda_rerand_64(
streams: CudaStreamsFFI,

View File

@@ -877,7 +877,7 @@ pub fn cuda_modulus_switch_ciphertext<Scalar>(
Scalar: UnsignedInteger,
{
unsafe {
-        cuda_modulus_switch_ciphertext_async(streams, output_lwe_ciphertext, log_modulus);
+        cuda_modulus_switch_ciphertext_async(streams, &mut *output_lwe_ciphertext, log_modulus);
}
streams.synchronize();
}

View File

@@ -1,4 +1,5 @@
use crate::core_crypto::gpu::entities::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey;
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::gpu::CudaStreams;
@@ -16,7 +17,8 @@ use crate::integer::gpu::ciphertext::CudaRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
cuda_backend_compress, cuda_backend_decompress, cuda_backend_get_compression_size_on_gpu,
-    cuda_backend_get_decompression_size_on_gpu, cuda_memcpy_async_gpu_to_gpu, PBSType,
+    cuda_backend_get_decompression_size_on_gpu, cuda_memcpy_async_gpu_to_gpu, extract_glwe_async,
+    PBSType,
};
use crate::prelude::CastInto;
use crate::shortint::ciphertext::{
@@ -197,6 +199,30 @@ impl<T: UnsignedInteger> CudaPackedGlweCiphertextList<T> {
meta: self.meta,
}
}
pub fn extract_glwe(
&self,
glwe_index: usize,
streams: &CudaStreams,
) -> CudaGlweCiphertextList<T> {
let meta = self
.meta
.as_ref()
.expect("CudaPackedGlweCiphertextList meta must be set to extract GLWE");
let mut output_cuda_glwe_list = CudaGlweCiphertextList::new(
meta.glwe_dimension,
meta.polynomial_size,
GlweCiphertextCount(1),
meta.ciphertext_modulus,
streams,
);
unsafe {
extract_glwe_async(streams, &mut output_cuda_glwe_list, self, glwe_index as u32);
}
streams.synchronize();
output_cuda_glwe_list
}
}
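A minimal usage sketch for the new method (hedged; `cuda_compression_key`, `compression_inputs` and `streams` are placeholder names for values built as in the tests further down):

// Pack ciphertexts on the GPU, then pull the first GLWE back out.
// `extract_glwe` allocates its output and synchronizes the streams itself.
let packed = cuda_compression_key.compress_ciphertexts_into_list(&compression_inputs, &streams);
let first_glwe = packed.extract_glwe(0, &streams);
let on_cpu = first_glwe.to_glwe_ciphertext_list(&streams);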
impl<T: UnsignedInteger> Clone for CudaPackedGlweCiphertextList<T> {

View File

@@ -7,6 +7,7 @@ pub mod server_key;
#[cfg(feature = "zk-pok")]
pub mod zk;
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_bootstrap_key::CudaModulusSwitchNoiseReductionConfiguration;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::lwe_compact_ciphertext_list::CudaLweCompactCiphertextList;
@@ -10423,3 +10424,44 @@ pub unsafe fn unchecked_small_scalar_mul_integer_async(
carry_modulus.0 as u32,
);
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
pub unsafe fn extract_glwe_async<T: UnsignedInteger>(
streams: &CudaStreams,
glwe_array_out: &mut CudaGlweCiphertextList<T>,
glwe_list: &CudaPackedGlweCiphertextList<T>,
glwe_index: u32,
) {
assert_eq!(
streams.gpu_indexes[0],
glwe_array_out.0.d_vec.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
assert_eq!(
streams.gpu_indexes[0],
glwe_list.data.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
let packed_glwe_list_ffi = prepare_cuda_packed_glwe_ct_ffi(glwe_list);
if T::BITS == 128 {
cuda_integer_extract_glwe_128(
streams.ffi(),
glwe_array_out.0.d_vec.as_mut_c_ptr(0),
&raw const packed_glwe_list_ffi,
glwe_index,
);
} else if T::BITS == 64 {
cuda_integer_extract_glwe_64(
streams.ffi(),
glwe_array_out.0.d_vec.as_mut_c_ptr(0),
&raw const packed_glwe_list_ffi,
glwe_index,
);
} else {
panic!("Unsupported integer size for CUDA GLWE extraction");
}
}
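A hedged illustration of the `T::BITS` dispatch above (placeholder variables; all data is assumed to already reside on the stream's GPU):

// Monomorphization selects the FFI symbol from T::BITS at compile time.
unsafe { extract_glwe_async::<u64>(&streams, &mut out_glwes_64, &packed_64, 0) };
unsafe { extract_glwe_async::<u128>(&streams, &mut out_glwes_128, &packed_128, 0) };
// Per the safety contract above, synchronize before reading the outputs.
streams.synchronize();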

View File

@@ -0,0 +1,756 @@
use super::utils::noise_simulation::{CudaDynLwe, CudaSideResources};
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::{GlweCiphertext, LweCiphertext};
use crate::integer::compression_keys::CompressionPrivateKeys;
use crate::integer::gpu::list_compression::server_keys::CudaCompressionKey;
use crate::integer::gpu::server_key::radix::tests_noise_distribution::utils::noise_simulation::cuda_glwe_list_to_glwe_ciphertext;
use crate::integer::gpu::server_key::radix::tests_unsigned::create_gpu_parameterized_test;
use crate::integer::gpu::server_key::radix::CudaUnsignedRadixCiphertext;
use crate::integer::gpu::CudaServerKey;
use crate::integer::{ClientKey, CompressedServerKey, IntegerCiphertext};
use crate::shortint::ciphertext::{Ciphertext, Degree, NoiseLevel};
use crate::shortint::client_key::atomic_pattern::AtomicPatternClientKey;
use crate::shortint::engine::ShortintEngine;
use crate::shortint::parameters::test_params::TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128;
use crate::shortint::parameters::{CompressionParameters, MetaParameters, Variance};
use crate::shortint::server_key::tests::noise_distribution::br_dp_packingks_ms::br_dp_packing_ks_ms;
use crate::shortint::server_key::tests::noise_distribution::utils::noise_simulation::{
NoiseSimulationGlwe, NoiseSimulationLwe, NoiseSimulationLweFourierBsk,
NoiseSimulationLwePackingKeyswitchKey, NoiseSimulationModulus,
};
use crate::shortint::server_key::tests::noise_distribution::utils::{
expected_pfail_for_precision, mean_and_variance_check, normality_check, pfail_check,
precision_with_padding, update_ap_params_msg_and_carry_moduli, DecryptionAndNoiseResult,
NoiseSample, PfailAndPrecision, PfailTestMeta, PfailTestResult,
};
use crate::shortint::server_key::tests::noise_distribution::{
should_run_short_pfail_tests_debug, should_use_single_key_debug,
};
use crate::shortint::{
AtomicPatternParameters, CarryModulus, MessageModulus, ShortintEncoding, ShortintParameterSet,
};
use crate::GpuIndex;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
fn sanity_check_encrypt_br_dp_packing_ks_ms(meta_params: MetaParameters) {
let (params, comp_params) = (
meta_params.compute_parameters,
meta_params.compression_parameters.unwrap(),
);
let gpu_index = 0;
let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let block_params: ShortintParameterSet = params.into();
let cks = crate::integer::ClientKey::new(block_params);
let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
let private_compression_key = cks.new_compression_private_key(comp_params);
let (compressed_compression_key, _compressed_decompression_key) =
cks.new_compressed_compression_decompression_keys(&private_compression_key);
let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&streams);
let lwe_per_glwe = cuda_compression_key.lwe_per_glwe;
// The multiplication done in the compression moves the message up to the top of the
// carry space; multiplying by the carry modulus achieves that
let dp_scalar = params.carry_modulus().0;
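// Hedged worked example: with 2_2 parameters (message_modulus = 4,
// carry_modulus = 4), dp_scalar = 4, so an encoded value m * delta becomes
// 4 * m * delta, shifting the 2 message bits up into the carry space.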
let br_input_modulus_log = cuda_sks.br_input_modulus_log();
let storage_modulus_log = cuda_compression_key.storage_log_modulus;
let id_lut = cuda_sks.generate_lookup_table(|x| x);
let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut.acc, &streams);
let input_zeros: Vec<_> = (0..lwe_per_glwe.0)
.map(|_| {
cks.key
.encrypt_noiseless_pbs_input_dyn_lwe(br_input_modulus_log, 0)
})
.collect();
let d_input_zeros: Vec<_> = input_zeros
.iter()
.map(|ct| {
let d_ct_input = CudaLweCiphertextList::from_lwe_ciphertext(&ct.as_lwe_64(), &streams);
CudaDynLwe::U64(d_ct_input)
})
.collect();
let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
degree: crate::shortint::ciphertext::Degree::new(params.message_modulus().0 - 1),
message_modulus: params.message_modulus(),
carry_modulus: params.carry_modulus(),
atomic_pattern: params.atomic_pattern(),
noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
};
let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
.map(|_| CudaSideResources::new(&streams, cuda_block_info))
.collect();
let (d_before_packing, _after_packing, d_after_ms) = br_dp_packing_ks_ms(
d_input_zeros,
&cuda_sks,
&d_accumulator,
dp_scalar,
&cuda_compression_key.packing_key_switching_key,
storage_modulus_log,
&mut cuda_side_resources,
);
let compression_inputs: Vec<_> = d_before_packing
.into_iter()
.map(|(_input, pbs_result, _dp_result)| {
let pbs_result_list_cpu = pbs_result.as_lwe_64().to_lwe_ciphertext_list(&streams);
let pbs_result_cpu = LweCiphertext::from_container(
pbs_result_list_cpu.clone().into_container(),
pbs_result_list_cpu.ciphertext_modulus(),
);
let cpu_ct = Ciphertext::new(
pbs_result_cpu,
Degree::new(params.message_modulus().0 - 1),
NoiseLevel::NOMINAL,
params.message_modulus(),
params.carry_modulus(),
params.atomic_pattern(),
);
let radix_ct = crate::integer::RadixCiphertext::from_blocks(vec![cpu_ct]);
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&radix_ct, &streams);
d_ct.ciphertext
})
.collect();
let gpu_compressed =
cuda_compression_key.compress_ciphertexts_into_list(&compression_inputs, &streams);
let gpu_extracted = gpu_compressed.extract_glwe(0, &streams);
let extracted_list = gpu_extracted.to_glwe_ciphertext_list(&streams);
let extracted_glwe = GlweCiphertext::from_container(
extracted_list.clone().into_container(),
extracted_list.polynomial_size(),
extracted_list.ciphertext_modulus(),
);
let after_ms_list = d_after_ms.to_glwe_ciphertext_list(&streams);
let mut after_ms = GlweCiphertext::from_container(
after_ms_list.clone().into_container(),
after_ms_list.polynomial_size(),
after_ms_list.ciphertext_modulus(),
);
// Bodies that were not filled are zeroed, as they are not part of the comparison
after_ms.get_mut_body().as_mut()[lwe_per_glwe.0..].fill(0);
assert_eq!(after_ms.as_view(), extracted_glwe.as_view());
}
create_gpu_parameterized_test!(sanity_check_encrypt_br_dp_packing_ks_ms {
TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
});
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
fn encrypt_br_dp_packing_ks_ms_inner_helper_gpu(
params: AtomicPatternParameters,
comp_params: CompressionParameters,
single_cks: &ClientKey,
single_cuda_sks: &CudaServerKey,
single_compression_private_key: &CompressionPrivateKeys,
single_cuda_compression_key: &CudaCompressionKey,
msg: u64,
streams: &CudaStreams,
) -> (
Vec<(
DecryptionAndNoiseResult,
DecryptionAndNoiseResult,
DecryptionAndNoiseResult,
)>,
Vec<DecryptionAndNoiseResult>,
Vec<DecryptionAndNoiseResult>,
) {
let mut engine = ShortintEngine::new();
let thread_cks: crate::integer::ClientKey;
let thread_cuda_sks: CudaServerKey;
let thread_compression_private_key;
let thread_cuda_compression_key;
let (cks, cuda_sks, compression_private_key, cuda_compression_key) =
if should_use_single_key_debug() {
(
single_cks,
single_cuda_sks,
single_compression_private_key,
single_cuda_compression_key,
)
} else {
let block_params: ShortintParameterSet = params.into();
thread_cks = crate::integer::ClientKey::new(block_params);
let compressed_server_key =
CompressedServerKey::new_radix_compressed_server_key(&thread_cks);
thread_cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, streams);
thread_compression_private_key = thread_cks.new_compression_private_key(comp_params);
let (compressed_compression_key, _compressed_decompression_key) = thread_cks
.new_compressed_compression_decompression_keys(&thread_compression_private_key);
thread_cuda_compression_key = compressed_compression_key.decompress_to_cuda(streams);
(
&thread_cks,
&thread_cuda_sks,
&thread_compression_private_key,
&thread_cuda_compression_key,
)
};
let br_input_modulus_log = cuda_sks.br_input_modulus_log();
let lwe_per_glwe = cuda_compression_key.lwe_per_glwe;
let input_zeros: Vec<_> = (0..lwe_per_glwe.0)
.map(|_| {
cks.key.encrypt_noiseless_pbs_input_dyn_lwe_with_engine(
br_input_modulus_log,
msg,
&mut engine,
)
})
.collect();
let d_input_zeros: Vec<_> = input_zeros
.iter()
.map(|ct| {
let d_ct_input = CudaLweCiphertextList::from_lwe_ciphertext(&ct.as_lwe_64(), streams);
CudaDynLwe::U64(d_ct_input)
})
.collect();
let id_lut = cuda_sks.generate_lookup_table(|x| x);
let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut.acc, streams);
let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
degree: crate::shortint::ciphertext::Degree::new(params.message_modulus().0 - 1),
message_modulus: params.message_modulus(),
carry_modulus: params.carry_modulus(),
atomic_pattern: params.atomic_pattern(),
noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
};
let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
.map(|_| CudaSideResources::new(streams, cuda_block_info))
.collect();
let dp_scalar = params.carry_modulus().0;
let storage_modulus_log = cuda_compression_key.storage_log_modulus;
let (d_before_packing, d_after_packing, d_after_ms) = br_dp_packing_ks_ms(
d_input_zeros,
cuda_sks,
&d_accumulator,
dp_scalar,
&cuda_compression_key.packing_key_switching_key,
storage_modulus_log,
&mut cuda_side_resources,
);
let compute_large_lwe_secret_key = cks.key.encryption_key();
let compression_glwe_secret_key = &compression_private_key.key.post_packing_ks_key;
let compute_encoding = cuda_sks.encoding();
let compression_encoding = ShortintEncoding {
carry_modulus: CarryModulus(1),
..compute_encoding
};
let after_packing = cuda_glwe_list_to_glwe_ciphertext(&d_after_packing, streams);
let after_ms = cuda_glwe_list_to_glwe_ciphertext(&d_after_ms, streams);
(
d_before_packing
.into_iter()
.map(|(d_input, d_pbs_result, d_dp_result)| {
let input = d_input.as_ct_64_cpu(streams);
let pbs_result = d_pbs_result.as_ct_64_cpu(streams);
let dp_result = d_dp_result.as_ct_64_cpu(streams);
(
match &cks.key.atomic_pattern {
AtomicPatternClientKey::Standard(standard_atomic_pattern_client_key) => {
DecryptionAndNoiseResult::new_from_lwe(
&input,
&standard_atomic_pattern_client_key.lwe_secret_key,
msg,
&compute_encoding,
)
}
AtomicPatternClientKey::KeySwitch32(_ks32_atomic_pattern_client_key) => {
panic!("KS32 Atomic Pattern not supported on GPU tests yet");
}
},
DecryptionAndNoiseResult::new_from_lwe(
&pbs_result,
&compute_large_lwe_secret_key,
msg,
&compute_encoding,
),
DecryptionAndNoiseResult::new_from_lwe(
&dp_result,
&compute_large_lwe_secret_key,
msg,
&compression_encoding,
),
)
})
.collect(),
DecryptionAndNoiseResult::new_from_glwe(
&after_packing,
compression_glwe_secret_key,
compression_private_key.key.params.lwe_per_glwe(),
msg,
&compression_encoding,
),
DecryptionAndNoiseResult::new_from_glwe(
&after_ms,
compression_glwe_secret_key,
compression_private_key.key.params.lwe_per_glwe(),
msg,
&compression_encoding,
),
)
}
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
fn encrypt_br_dp_packing_ks_ms_noise_helper_gpu(
params: AtomicPatternParameters,
comp_params: CompressionParameters,
single_cks: &ClientKey,
single_cuda_sks: &CudaServerKey,
single_compression_private_key: &CompressionPrivateKeys,
single_cuda_compression_key: &CudaCompressionKey,
msg: u64,
streams: &CudaStreams,
) -> (
Vec<(NoiseSample, NoiseSample, NoiseSample)>,
Vec<NoiseSample>,
Vec<NoiseSample>,
) {
let (before_packing, after_packing, after_ms) = encrypt_br_dp_packing_ks_ms_inner_helper_gpu(
params,
comp_params,
single_cks,
single_cuda_sks,
single_compression_private_key,
single_cuda_compression_key,
msg,
streams,
);
(
before_packing
.into_iter()
.map(|(input, after_pbs, after_dp)| {
(
input
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
after_pbs
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
after_dp
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
)
})
.collect(),
after_packing
.into_iter()
.map(|x| {
x.get_noise_if_decryption_was_correct()
.expect("Decryption Failed")
})
.collect(),
after_ms
.into_iter()
.map(|x| {
x.get_noise_if_decryption_was_correct()
.expect("Decryption Failed")
})
.collect(),
)
}
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
fn encrypt_br_dp_packing_ks_ms_pfail_helper_gpu(
params: AtomicPatternParameters,
comp_params: CompressionParameters,
single_cks: &ClientKey,
single_cuda_sks: &CudaServerKey,
single_compression_private_key: &CompressionPrivateKeys,
single_cuda_compression_key: &CudaCompressionKey,
msg: u64,
streams: &CudaStreams,
) -> Vec<DecryptionAndNoiseResult> {
let (_before_packing, _after_packing, after_ms) = encrypt_br_dp_packing_ks_ms_inner_helper_gpu(
params,
comp_params,
single_cks,
single_cuda_sks,
single_compression_private_key,
single_cuda_compression_key,
msg,
streams,
);
after_ms
}
fn noise_check_encrypt_br_dp_packing_ks_ms_noise_gpu(meta_params: MetaParameters) {
let (params, comp_params) = (
meta_params.compute_parameters,
meta_params.compression_parameters.unwrap(),
);
let gpu_index = 0;
let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let block_params: ShortintParameterSet = params.into();
let cks = crate::integer::ClientKey::new(block_params);
let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
let private_compression_key = cks.new_compression_private_key(comp_params);
let (compressed_compression_key, _compressed_decompression_key) =
cks.new_compressed_compression_decompression_keys(&private_compression_key);
let compression_key = compressed_compression_key.decompress();
let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&streams);
let noise_simulation_bsk =
NoiseSimulationLweFourierBsk::new_from_atomic_pattern_parameters(params);
let noise_simulation_packing_key =
NoiseSimulationLwePackingKeyswitchKey::new_from_comp_parameters(params, comp_params);
assert!(noise_simulation_bsk.matches_actual_bsk_gpu(&cuda_sks.bootstrapping_key));
assert!(noise_simulation_packing_key.matches_actual_shortint_comp_key(&compression_key.key));
// The multiplication done in the compression moves the message up to the top of the
// carry space; multiplying by the carry modulus achieves that
let dp_scalar = params.carry_modulus().0;
let noise_simulation_accumulator = NoiseSimulationGlwe::new(
noise_simulation_bsk.output_glwe_size().to_glwe_dimension(),
noise_simulation_bsk.output_polynomial_size(),
Variance(0.0),
noise_simulation_bsk.modulus(),
);
let lwe_per_glwe = cuda_compression_key.lwe_per_glwe;
let storage_modulus_log = cuda_compression_key.storage_log_modulus;
let br_input_modulus_log = cuda_sks.br_input_modulus_log();
let (_before_packing_sim, _after_packing_sim, after_ms_sim) = {
let noise_simulation = NoiseSimulationLwe::new(
cks.parameters().lwe_dimension(),
Variance(0.0),
NoiseSimulationModulus::from_ciphertext_modulus(cks.parameters().ciphertext_modulus()),
);
br_dp_packing_ks_ms(
vec![noise_simulation; lwe_per_glwe.0],
&noise_simulation_bsk,
&noise_simulation_accumulator,
dp_scalar,
&noise_simulation_packing_key,
storage_modulus_log,
&mut vec![(); lwe_per_glwe.0],
)
};
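// Hedged note on the call above: the noise-simulation path carries no GPU
// state, so `vec![(); lwe_per_glwe.0]` stands in for the CudaSideResources
// used by the real GPU evaluation further down.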
let input_zeros: Vec<_> = (0..lwe_per_glwe.0)
.map(|_| {
cks.key
.encrypt_noiseless_pbs_input_dyn_lwe(br_input_modulus_log, 0)
})
.collect();
let d_input_zeros: Vec<_> = input_zeros
.iter()
.map(|ct| {
let d_ct_input = CudaLweCiphertextList::from_lwe_ciphertext(&ct.as_lwe_64(), &streams);
CudaDynLwe::U64(d_ct_input)
})
.collect();
let id_lut = cuda_sks.generate_lookup_table(|x| x);
let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut.acc, &streams);
let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
degree: crate::shortint::ciphertext::Degree::new(params.message_modulus().0 - 1),
message_modulus: params.message_modulus(),
carry_modulus: params.carry_modulus(),
atomic_pattern: params.atomic_pattern(),
noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
};
let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
.map(|_| CudaSideResources::new(&streams, cuda_block_info))
.collect();
// Check that the circuit is consistent with the core implementation, i.e. that
// it does not crash on dimension checks
let (expected_glwe_size_out, expected_polynomial_size_out, expected_modulus_f64_out) = {
let (_before_packing_sim, _after_packing, after_ms) = br_dp_packing_ks_ms(
d_input_zeros,
&cuda_sks,
&d_accumulator,
dp_scalar,
&cuda_compression_key.packing_key_switching_key,
storage_modulus_log,
&mut cuda_side_resources,
);
(
after_ms.glwe_dimension().to_glwe_size(),
after_ms.polynomial_size(),
after_ms.ciphertext_modulus().raw_modulus_float(),
)
};
assert_eq!(after_ms_sim.glwe_size(), expected_glwe_size_out);
assert_eq!(after_ms_sim.polynomial_size(), expected_polynomial_size_out);
assert_eq!(after_ms_sim.modulus().as_f64(), expected_modulus_f64_out);
let cleartext_modulus = params.message_modulus().0 * params.carry_modulus().0;
let mut noise_samples_before_ms = vec![];
let mut noise_samples_after_ms = vec![];
let sample_count_per_msg = 1000usize;
let chunk_size = 8;
let vec_local_streams = (0..chunk_size)
.map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)))
.collect::<Vec<_>>();
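// Hedged note: one dedicated CUDA stream per chunk slot; workers index
// vec_local_streams[i % chunk_size] so parallel iterations never share a stream.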
for _ in 0..cleartext_modulus {
let (current_noise_samples_before_ms, current_noise_samples_after_ms): (Vec<_>, Vec<_>) =
(0..sample_count_per_msg)
.collect::<Vec<_>>()
.chunks(chunk_size)
.flat_map(|chunk| {
chunk
.into_par_iter()
.map(|i| {
let local_stream = &vec_local_streams[*i % chunk_size];
let (_before_packing, after_packing, after_ms) =
encrypt_br_dp_packing_ks_ms_noise_helper_gpu(
params,
comp_params,
&cks,
&cuda_sks,
&private_compression_key,
&cuda_compression_key,
0,
local_stream,
);
(after_packing, after_ms)
})
.collect::<Vec<_>>()
})
.unzip();
noise_samples_before_ms.extend(current_noise_samples_before_ms);
noise_samples_after_ms.extend(current_noise_samples_after_ms);
}
let noise_samples_before_ms_flattened: Vec<_> = noise_samples_before_ms
.into_iter()
.flatten()
.map(|x| x.value)
.collect();
let noise_samples_after_ms_flattened: Vec<_> = noise_samples_after_ms
.into_iter()
.flatten()
.map(|x| x.value)
.collect();
let before_ms_normality =
normality_check(&noise_samples_before_ms_flattened, "before ms", 0.01);
let after_ms_is_ok = mean_and_variance_check(
&noise_samples_after_ms_flattened,
"after_ms",
0.0,
after_ms_sim.variance_per_occupied_slot(),
comp_params.packing_ks_key_noise_distribution(),
after_ms_sim
.glwe_dimension()
.to_equivalent_lwe_dimension(after_ms_sim.polynomial_size()),
after_ms_sim.modulus().as_f64(),
);
assert!(before_ms_normality.null_hypothesis_is_valid && after_ms_is_ok);
}
create_gpu_parameterized_test!(noise_check_encrypt_br_dp_packing_ks_ms_noise_gpu {
TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
});
fn noise_check_encrypt_br_dp_packing_ks_ms_pfail_gpu(meta_params: MetaParameters) {
let (pfail_test_meta, params, comp_params) = {
let (mut params, comp_params) = (
meta_params.compute_parameters,
meta_params.compression_parameters.unwrap(),
);
let original_message_modulus = params.message_modulus();
let original_carry_modulus = params.carry_modulus();
// For now, only allow 2_2 parameters; heuristics for other parameter sets can come later
assert_eq!(original_message_modulus.0, 4);
assert_eq!(original_carry_modulus.0, 4);
let noise_simulation_bsk =
NoiseSimulationLweFourierBsk::new_from_atomic_pattern_parameters(params);
let noise_simulation_packing_key =
NoiseSimulationLwePackingKeyswitchKey::new_from_comp_parameters(params, comp_params);
// The multiplication done in the compression moves the message up to the top of
// the carry space; multiplying by the carry modulus achieves that
let dp_scalar = params.carry_modulus().0;
let noise_simulation_accumulator = NoiseSimulationGlwe::new(
noise_simulation_bsk.output_glwe_size().to_glwe_dimension(),
noise_simulation_bsk.output_polynomial_size(),
Variance(0.0),
noise_simulation_bsk.modulus(),
);
let lwe_per_glwe = comp_params.lwe_per_glwe();
let storage_modulus_log = comp_params.storage_log_modulus();
let (_before_packing_sim, _after_packing_sim, after_ms_sim) = {
let noise_simulation = NoiseSimulationLwe::new(
params.lwe_dimension(),
Variance(0.0),
NoiseSimulationModulus::from_ciphertext_modulus(params.ciphertext_modulus()),
);
br_dp_packing_ks_ms(
vec![noise_simulation; lwe_per_glwe.0],
&noise_simulation_bsk,
&noise_simulation_accumulator,
dp_scalar,
&noise_simulation_packing_key,
storage_modulus_log,
&mut vec![(); lwe_per_glwe.0],
)
};
let expected_variance_after_storage = after_ms_sim.variance_per_occupied_slot();
let compression_carry_mod = CarryModulus(1);
let compression_message_mod = original_message_modulus;
let compression_precision_with_padding =
precision_with_padding(compression_message_mod, compression_carry_mod);
let expected_pfail_for_storage = expected_pfail_for_precision(
compression_precision_with_padding,
expected_variance_after_storage,
);
let original_pfail_and_precision = PfailAndPrecision::new(
expected_pfail_for_storage,
compression_message_mod,
compression_carry_mod,
);
// Here we update the message modulus only:
// - because the message modulus matches for the compression encoding and compute encoding
// - so that the carry modulus stays the same and we apply the same dot product as normal
// for 2_2
// - so that the effective encoding after the storage is the one we used to evaluate the
// pfail
let updated_message_mod = MessageModulus(1 << 6);
let updated_carry_mod = compression_carry_mod;
update_ap_params_msg_and_carry_moduli(&mut params, updated_message_mod, updated_carry_mod);
assert!(
(params.message_modulus().0 * params.carry_modulus().0).ilog2()
<= comp_params.storage_log_modulus().0 as u32,
"Compression storage modulus cannot store enough bits for pfail estimation"
);
let updated_precision_with_padding =
precision_with_padding(updated_message_mod, updated_carry_mod);
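// Hedged arithmetic (assuming precision_with_padding = log2(msg * carry) + 1):
// updated_message_mod = 2^6 with a carry modulus of 1 gives 6 + 1 = 7 bits,
// which must fit under the storage modulus checked by the assert above.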
let new_expected_pfail_for_storage = expected_pfail_for_precision(
updated_precision_with_padding,
expected_variance_after_storage,
);
let new_expected_pfail_and_precision = PfailAndPrecision::new(
new_expected_pfail_for_storage,
updated_message_mod,
updated_carry_mod,
);
let pfail_test_meta = if should_run_short_pfail_tests_debug() {
// To generate the same number of keys as the case where a single run is a
// single sample
let expected_fails = 200 * lwe_per_glwe.0 as u32;
PfailTestMeta::new_with_desired_expected_fails(
original_pfail_and_precision,
new_expected_pfail_and_precision,
expected_fails,
)
} else {
// To guarantee 1_000_000 keysets are generated
let total_runs = 1_000_000 * lwe_per_glwe.0 as u32;
PfailTestMeta::new_with_total_runs(
original_pfail_and_precision,
new_expected_pfail_and_precision,
total_runs,
)
};
(pfail_test_meta, params, comp_params)
};
let gpu_index = 0;
let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let block_params: ShortintParameterSet = params.into();
let cks = crate::integer::ClientKey::new(block_params);
let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
let private_compression_key = cks.new_compression_private_key(comp_params);
let (compressed_compression_key, _compressed_decompression_key) =
cks.new_compressed_compression_decompression_keys(&private_compression_key);
let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&streams);
let lwe_per_glwe = cuda_compression_key.lwe_per_glwe;
let total_runs_for_expected_fails = pfail_test_meta
.total_runs_for_expected_fails()
.div_ceil(lwe_per_glwe.0.try_into().unwrap());
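// Hedged note: each helper run packs lwe_per_glwe ciphertexts and therefore
// yields lwe_per_glwe pfail samples, hence the division by lwe_per_glwe.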
let chunk_size = 8;
let vec_local_streams = (0..chunk_size)
.map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)))
.collect::<Vec<_>>();
let measured_fails: f64 = (0..total_runs_for_expected_fails)
.collect::<Vec<_>>()
.chunks(chunk_size)
.flat_map(|chunk| {
chunk
.into_par_iter()
.map(|i| {
let local_streams = &vec_local_streams[*i as usize % chunk_size];
let after_ms_decryption_result = encrypt_br_dp_packing_ks_ms_pfail_helper_gpu(
params,
comp_params,
&cks,
&cuda_sks,
&private_compression_key,
&cuda_compression_key,
0,
local_streams,
);
after_ms_decryption_result
.into_iter()
.map(|result| result.failure_as_f64())
.sum::<f64>()
})
.collect::<Vec<_>>()
})
.sum();
let test_result = PfailTestResult { measured_fails };
pfail_check(&pfail_test_meta, test_result);
}
create_gpu_parameterized_test!(noise_check_encrypt_br_dp_packing_ks_ms_pfail_gpu {
TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
});

View File

@@ -0,0 +1,872 @@
use super::utils::noise_simulation::{CudaDynLwe, CudaSideResources};
use crate::core_crypto::commons::noise_formulas::noise_simulation::{
NoiseSimulationLweFourier128Bsk, NoiseSimulationLwePackingKeyswitchKey,
};
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::{GlweCiphertext, LweCiphertextCount};
use crate::integer::gpu::CudaServerKey;
use crate::integer::noise_squashing::NoiseSquashingPrivateKey;
use crate::integer::CompressedServerKey;
use crate::core_crypto::commons::parameters::CiphertextModulusLog;
use crate::core_crypto::prelude::generate_programmable_bootstrap_glwe_lut;
use crate::integer::ciphertext::NoiseSquashingCompressionPrivateKey;
use crate::integer::gpu::list_compression::server_keys::CudaNoiseSquashingCompressionKey;
use crate::integer::gpu::server_key::radix::tests_unsigned::create_gpu_parameterized_test;
use crate::integer::gpu::server_key::radix::{CudaNoiseSquashingKey, CudaUnsignedRadixCiphertext};
use crate::integer::gpu::unchecked_small_scalar_mul_integer_async;
use crate::integer::IntegerCiphertext;
use crate::shortint::client_key::atomic_pattern::AtomicPatternClientKey;
use crate::shortint::parameters::noise_squashing::NoiseSquashingParameters;
use crate::shortint::parameters::test_params::TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128;
use crate::shortint::parameters::{
AtomicPatternParameters, MetaParameters, NoiseSquashingCompressionParameters, Variance,
};
use crate::shortint::server_key::tests::noise_distribution::dp_ks_pbs128_packingks::{
dp_ks_any_ms_standard_pbs128, dp_ks_any_ms_standard_pbs128_packing_ks,
};
use crate::shortint::server_key::tests::noise_distribution::should_use_single_key_debug;
use crate::shortint::server_key::tests::noise_distribution::utils::noise_simulation::{
NoiseSimulationGlwe, NoiseSimulationLwe, NoiseSimulationLweFourierBsk,
NoiseSimulationLweKeyswitchKey, NoiseSimulationModulusSwitchConfig,
};
use crate::shortint::server_key::tests::noise_distribution::utils::{
mean_and_variance_check, DecryptionAndNoiseResult, NoiseSample,
};
use crate::shortint::{PaddingBit, ShortintEncoding, ShortintParameterSet};
use crate::GpuIndex;
use rayon::prelude::*;
/// Test function to verify that the noise checking tools match the actual atomic patterns
/// implemented in shortint for GPU
fn sanity_check_encrypt_dp_ks_standard_pbs128_packing_ks_gpu(meta_params: MetaParameters) {
let (atomic_params, noise_squashing_params, noise_squashing_compression_params) = {
let meta_noise_squashing_params = meta_params.noise_squashing_parameters.unwrap();
(
meta_params.compute_parameters,
meta_noise_squashing_params.parameters,
meta_noise_squashing_params.compression_parameters.unwrap(),
)
};
let gpu_index = 0;
let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let block_params: ShortintParameterSet = atomic_params.into();
let cks = crate::integer::ClientKey::new(block_params);
let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_params);
let compressed_noise_squashing_compression_key =
cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
let noise_squashing_key = compressed_noise_squashing_compression_key.decompress();
let cuda_noise_squashing_key =
compressed_noise_squashing_compression_key.decompress_to_cuda(&streams);
let noise_squashing_compression_private_key =
NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_params);
let noise_squashing_compression_key = noise_squashing_private_key
.new_noise_squashing_compression_key(&noise_squashing_compression_private_key);
let cuda_noise_squashing_compression_key =
CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
&noise_squashing_compression_key,
&streams,
);
let lwe_per_glwe = cuda_noise_squashing_compression_key.lwe_per_glwe;
let modulus_switch_config = cuda_noise_squashing_key.noise_simulation_modulus_switch_config();
let br_input_modulus_log = noise_squashing_key.key.br_input_modulus_log();
let u128_encoding = ShortintEncoding {
ciphertext_modulus: noise_squashing_params.ciphertext_modulus(),
message_modulus: noise_squashing_params.message_modulus(),
carry_modulus: noise_squashing_params.carry_modulus(),
padding_bit: PaddingBit::Yes,
};
let max_scalar_mul = cuda_sks.max_noise_level.get();
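// Hedged note: using the maximum tolerated noise level as the dot-product
// scalar exercises the worst-case noise growth allowed before the PBS128.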
let id_lut_cpu = generate_programmable_bootstrap_glwe_lut(
noise_squashing_key.key.polynomial_size(),
noise_squashing_key.key.glwe_size(),
u128_encoding
.cleartext_space_without_padding()
.try_into()
.unwrap(),
u128_encoding.ciphertext_modulus,
u128_encoding.delta(),
|x| x,
);
let id_lut_gpu = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut_cpu, &streams);
let input_zeros: Vec<_> = (0..lwe_per_glwe.0).map(|_| cks.key.encrypt(0)).collect();
let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
degree: crate::shortint::ciphertext::Degree::new(atomic_params.message_modulus().0 - 1),
message_modulus: atomic_params.message_modulus(),
carry_modulus: atomic_params.carry_modulus(),
atomic_pattern: atomic_params.atomic_pattern(),
noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
};
let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
.map(|_| CudaSideResources::new(&streams, cuda_block_info))
.collect();
let input_zero_as_lwe: Vec<_> = input_zeros
.iter()
.map(|ct| {
let d_ct_input = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
&crate::integer::RadixCiphertext::from_blocks(vec![ct.clone()]),
&streams,
);
CudaDynLwe::U64(d_ct_input.ciphertext.d_blocks)
})
.collect();
let (_before_packing, d_after_packing) = dp_ks_any_ms_standard_pbs128_packing_ks(
input_zero_as_lwe,
max_scalar_mul,
&cuda_sks,
modulus_switch_config,
&cuda_noise_squashing_key,
br_input_modulus_log,
&id_lut_gpu,
&cuda_noise_squashing_compression_key.packing_key_switching_key,
&mut cuda_side_resources,
);
let cuda_noise_squashed_cts: Vec<_> = input_zeros
.into_par_iter()
.map(|ct| {
let cloned_ct = ct;
let radix_ct = crate::integer::RadixCiphertext::from_blocks(vec![cloned_ct]);
let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&radix_ct, &streams);
unsafe {
unchecked_small_scalar_mul_integer_async(
&streams,
&mut d_ct.ciphertext,
max_scalar_mul,
atomic_params.message_modulus(),
atomic_params.carry_modulus(),
);
}
streams.synchronize();
cuda_noise_squashing_key.unchecked_squash_ciphertext_noise(
&d_ct.ciphertext,
&cuda_sks,
&streams,
)
})
.collect();
let gpu_compressed = cuda_noise_squashing_compression_key
.compress_noise_squashed_ciphertexts_into_list(&cuda_noise_squashed_cts, &streams);
let gpu_extracted = gpu_compressed.extract_glwe(0, &streams);
let extracted_list = gpu_extracted.to_glwe_ciphertext_list(&streams);
let extracted_glwe = GlweCiphertext::from_container(
extracted_list.clone().into_container(),
extracted_list.polynomial_size(),
extracted_list.ciphertext_modulus(),
);
let after_packing_list = d_after_packing.to_glwe_ciphertext_list(&streams);
let mut after_packing = GlweCiphertext::from_container(
after_packing_list.clone().into_container(),
after_packing_list.polynomial_size(),
after_packing_list.ciphertext_modulus(),
);
// Bodies that were not filled are zeroed, as they are not part of the comparison
after_packing.get_mut_body().as_mut()[lwe_per_glwe.0..].fill(0);
assert_eq!(after_packing.as_view(), extracted_glwe.as_view());
}
/// Test function to verify that the noise checking tools match the actual atomic patterns
/// implemented in shortint for GPU
fn sanity_check_encrypt_dp_ks_standard_pbs128_gpu(meta_params: MetaParameters) {
let (params, noise_squashing_params) = {
let meta_noise_squashing_params = meta_params.noise_squashing_parameters.unwrap();
(
meta_params.compute_parameters,
meta_noise_squashing_params.parameters,
)
};
let gpu_index = 0;
let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let block_params: ShortintParameterSet = params.into();
let cks = crate::integer::ClientKey::new(block_params);
let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_params);
let compressed_noise_squashing_compression_key =
cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
let noise_squashing_key = compressed_noise_squashing_compression_key.decompress();
let cuda_noise_squashing_key =
compressed_noise_squashing_compression_key.decompress_to_cuda(&streams);
let modulus_switch_config = cuda_noise_squashing_key.noise_simulation_modulus_switch_config();
let br_input_modulus_log = noise_squashing_key.key.br_input_modulus_log();
let u128_encoding = ShortintEncoding {
ciphertext_modulus: noise_squashing_params.ciphertext_modulus(),
message_modulus: noise_squashing_params.message_modulus(),
carry_modulus: noise_squashing_params.carry_modulus(),
padding_bit: PaddingBit::Yes,
};
let max_scalar_mul = cuda_sks.max_noise_level.get();
let id_lut_cpu = generate_programmable_bootstrap_glwe_lut(
noise_squashing_key.key.polynomial_size(),
noise_squashing_key.key.glwe_size(),
u128_encoding
.cleartext_space_without_padding()
.try_into()
.unwrap(),
u128_encoding.ciphertext_modulus,
u128_encoding.delta(),
|x| x,
);
let id_lut_gpu = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut_cpu, &streams);
let lwe_per_glwe = LweCiphertextCount(128);
let input_zeros: Vec<_> = (0..lwe_per_glwe.0).map(|_| cks.key.encrypt(0)).collect();
let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
degree: crate::shortint::ciphertext::Degree::new(params.message_modulus().0 - 1),
message_modulus: params.message_modulus(),
carry_modulus: params.carry_modulus(),
atomic_pattern: params.atomic_pattern(),
noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
};
let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
.map(|_| CudaSideResources::new(&streams, cuda_block_info))
.collect();
let input_zero_as_lwe: Vec<_> = input_zeros
.iter()
.map(|ct| {
let d_ct_input = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
&crate::integer::RadixCiphertext::from_blocks(vec![ct.clone()]),
&streams,
);
CudaDynLwe::U64(d_ct_input.ciphertext.d_blocks)
})
.collect();
let res: Vec<_> = input_zero_as_lwe
.into_par_iter()
.zip(cuda_side_resources.par_iter_mut())
.map(|(input, side_resources)| {
let (input, after_dp, ks_result, drift_technique_result, ms_result, pbs_result) =
dp_ks_any_ms_standard_pbs128(
input,
max_scalar_mul,
&cuda_sks,
modulus_switch_config,
&cuda_noise_squashing_key,
br_input_modulus_log,
&id_lut_gpu,
side_resources,
);
(
input,
after_dp,
ks_result,
drift_technique_result,
ms_result,
pbs_result,
)
})
.collect();
let input_zeros_non_pattern: Vec<_> = input_zeros
.iter()
.map(|ct| {
CudaUnsignedRadixCiphertext::from_radix_ciphertext(
&crate::integer::RadixCiphertext::from_blocks(vec![ct.clone()]),
&streams,
)
})
.collect();
let vector_non_pattern: Vec<_> = input_zeros_non_pattern
.into_par_iter()
.map(|mut d_ct_input2| {
unsafe {
unchecked_small_scalar_mul_integer_async(
&streams,
&mut d_ct_input2.ciphertext,
max_scalar_mul,
params.message_modulus(),
params.carry_modulus(),
);
}
streams.synchronize();
cuda_noise_squashing_key
.squash_radix_ciphertext_noise(&cuda_sks, &d_ct_input2.ciphertext, &streams)
.unwrap()
})
.collect();
let vector_pattern_cpu: Vec<_> = res
.into_iter()
.map(
|(_input, _after_dp, _ks_result, _drift_technique_result, _ms_result, pbs_result)| {
pbs_result.as_ct_128_cpu(&streams)
},
)
.collect();
let vector_non_pattern_cpu: Vec<_> = vector_non_pattern
.into_par_iter()
.map(|cuda_squashed_radix_ct| {
let squashed_noise_ct_cpu =
cuda_squashed_radix_ct.to_squashed_noise_radix_ciphertext(&streams);
squashed_noise_ct_cpu.packed_blocks()[0]
.lwe_ciphertext()
.clone()
})
.collect();
// Check that all the results are equivalent
assert_eq!(vector_pattern_cpu.len(), vector_non_pattern_cpu.len());
for (a, b) in vector_pattern_cpu.iter().zip(vector_non_pattern_cpu.iter()) {
assert_eq!(a.as_view(), b.as_view());
}
}
#[allow(clippy::too_many_arguments)]
#[allow(clippy::type_complexity)]
fn encrypt_dp_ks_standard_pbs128_packing_ks_inner_helper_gpu(
params: AtomicPatternParameters,
noise_squashing_params: NoiseSquashingParameters,
noise_squashing_compression_params: NoiseSquashingCompressionParameters,
single_cks: &crate::integer::ClientKey,
single_cuda_sks: &CudaServerKey,
single_noise_squashing_private_key: &NoiseSquashingPrivateKey,
single_noise_squashing_key: &crate::integer::noise_squashing::NoiseSquashingKey,
single_cuda_noise_squashing_key: &CudaNoiseSquashingKey,
single_noise_squashing_compression_private_key: &NoiseSquashingCompressionPrivateKey,
single_cuda_noise_squashing_compression_key: &CudaNoiseSquashingCompressionKey,
msg: u64,
scalar_for_multiplication: u64,
br_input_modulus_log: CiphertextModulusLog,
streams: &CudaStreams,
) -> (
Vec<(
DecryptionAndNoiseResult,
DecryptionAndNoiseResult,
DecryptionAndNoiseResult,
DecryptionAndNoiseResult,
DecryptionAndNoiseResult,
DecryptionAndNoiseResult,
)>,
Vec<DecryptionAndNoiseResult>,
) {
let thread_cks: crate::integer::ClientKey;
let thread_cuda_sks: CudaServerKey;
let thread_noise_squashing_private_key: NoiseSquashingPrivateKey;
let thread_noise_squashing_key: crate::integer::noise_squashing::NoiseSquashingKey;
let thread_cuda_noise_squashing_key: CudaNoiseSquashingKey;
let thread_noise_squashing_compression_private_key: NoiseSquashingCompressionPrivateKey;
let thread_cuda_noise_squashing_compression_key: CudaNoiseSquashingCompressionKey;
let (
cks,
cuda_sks,
noise_squashing_private_key,
noise_squashing_key,
cuda_noise_squashing_key,
noise_squashing_compression_private_key,
cuda_noise_squashing_compression_key,
) = if should_use_single_key_debug() {
(
single_cks,
single_cuda_sks,
single_noise_squashing_private_key,
single_noise_squashing_key,
single_cuda_noise_squashing_key,
single_noise_squashing_compression_private_key,
single_cuda_noise_squashing_compression_key,
)
} else {
let block_params: ShortintParameterSet = params.into();
thread_cks = crate::integer::ClientKey::new(block_params);
let thread_compressed_server_key =
CompressedServerKey::new_radix_compressed_server_key(&thread_cks);
thread_cuda_sks =
CudaServerKey::decompress_from_cpu(&thread_compressed_server_key, streams);
thread_noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_params);
let thread_compressed_noise_squashing_compression_key =
thread_cks.new_compressed_noise_squashing_key(&thread_noise_squashing_private_key);
thread_noise_squashing_key = thread_compressed_noise_squashing_compression_key.decompress();
thread_cuda_noise_squashing_key =
thread_compressed_noise_squashing_compression_key.decompress_to_cuda(streams);
thread_noise_squashing_compression_private_key =
NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_params);
let thread_noise_squashing_compression_key = thread_noise_squashing_private_key
.new_noise_squashing_compression_key(&thread_noise_squashing_compression_private_key);
thread_cuda_noise_squashing_compression_key =
CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
&thread_noise_squashing_compression_key,
streams,
);
(
&thread_cks,
&thread_cuda_sks,
&thread_noise_squashing_private_key,
&thread_noise_squashing_key,
&thread_cuda_noise_squashing_key,
&thread_noise_squashing_compression_private_key,
&thread_cuda_noise_squashing_compression_key,
)
};
let modulus_switch_config = cuda_noise_squashing_key.noise_simulation_modulus_switch_config();
let bsk_polynomial_size = noise_squashing_key.key.polynomial_size();
let bsk_glwe_size = noise_squashing_key.key.glwe_size();
let u128_encoding = ShortintEncoding {
ciphertext_modulus: noise_squashing_params.ciphertext_modulus(),
message_modulus: noise_squashing_params.message_modulus(),
carry_modulus: noise_squashing_params.carry_modulus(),
padding_bit: PaddingBit::Yes,
};
let id_lut_cpu = generate_programmable_bootstrap_glwe_lut(
bsk_polynomial_size,
bsk_glwe_size,
u128_encoding
.cleartext_space_without_padding()
.try_into()
.unwrap(),
u128_encoding.ciphertext_modulus,
u128_encoding.delta(),
|x| x,
);
let id_lut_gpu = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut_cpu, streams);
let lwe_per_glwe = cuda_noise_squashing_compression_key.lwe_per_glwe;
let input_zeros: Vec<_> = (0..lwe_per_glwe.0).map(|_| cks.key.encrypt(msg)).collect();
let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
degree: crate::shortint::ciphertext::Degree::new(params.message_modulus().0 - 1),
message_modulus: params.message_modulus(),
carry_modulus: params.carry_modulus(),
atomic_pattern: params.atomic_pattern(),
noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
};
let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
.map(|_| CudaSideResources::new(streams, cuda_block_info))
.collect();
let input_zero_as_lwe: Vec<_> = input_zeros
.iter()
.map(|ct| {
let d_ct_input = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
&crate::integer::RadixCiphertext::from_blocks(vec![ct.clone()]),
streams,
);
CudaDynLwe::U64(d_ct_input.ciphertext.d_blocks)
})
.collect();
let (before_packing_gpu, after_packing_gpu) = dp_ks_any_ms_standard_pbs128_packing_ks(
input_zero_as_lwe,
scalar_for_multiplication,
cuda_sks,
modulus_switch_config,
cuda_noise_squashing_key,
br_input_modulus_log,
&id_lut_gpu,
&cuda_noise_squashing_compression_key.packing_key_switching_key,
&mut cuda_side_resources,
);
let before_packing: Vec<_> = before_packing_gpu
.into_iter()
.map(
|(
input_gpu,
after_dp_gpu,
after_ks_gpu,
after_drift_gpu,
after_ms_gpu,
after_pbs128_gpu,
)| {
match &cks.key.atomic_pattern {
AtomicPatternClientKey::Standard(standard_atomic_pattern_client_key) => {
let params = standard_atomic_pattern_client_key.parameters;
let u64_encoding = ShortintEncoding {
ciphertext_modulus: params.ciphertext_modulus(),
message_modulus: params.message_modulus(),
carry_modulus: params.carry_modulus(),
padding_bit: PaddingBit::Yes,
};
let large_lwe_secret_key =
standard_atomic_pattern_client_key.large_lwe_secret_key();
let small_lwe_secret_key =
standard_atomic_pattern_client_key.small_lwe_secret_key();
let input_ct = input_gpu.as_ct_64_cpu(streams);
let after_dp_ct = after_dp_gpu.as_ct_64_cpu(streams);
let after_ks_ct = after_ks_gpu.as_ct_64_cpu(streams);
let before_ms_gpu: &CudaDynLwe =
after_drift_gpu.as_ref().unwrap_or(&after_ks_gpu);
let before_ms_ct = before_ms_gpu.as_ct_64_cpu(streams);
let after_ms_ct = after_ms_gpu.as_ct_64_cpu(streams);
let after_pbs128_ct = after_pbs128_gpu.as_ct_128_cpu(streams);
(
DecryptionAndNoiseResult::new_from_lwe(
&input_ct,
&large_lwe_secret_key,
msg,
&u64_encoding,
),
DecryptionAndNoiseResult::new_from_lwe(
&after_dp_ct,
&large_lwe_secret_key,
msg,
&u64_encoding,
),
DecryptionAndNoiseResult::new_from_lwe(
&after_ks_ct,
&small_lwe_secret_key,
msg,
&u64_encoding,
),
DecryptionAndNoiseResult::new_from_lwe(
&before_ms_ct,
&small_lwe_secret_key,
msg,
&u64_encoding,
),
DecryptionAndNoiseResult::new_from_lwe(
&after_ms_ct,
&small_lwe_secret_key,
msg,
&u64_encoding,
),
DecryptionAndNoiseResult::new_from_lwe(
&after_pbs128_ct,
&noise_squashing_private_key
.key
.post_noise_squashing_lwe_secret_key(),
msg.into(),
&u128_encoding,
),
)
}
AtomicPatternClientKey::KeySwitch32(_ks32_atomic_pattern_client_key) => {
panic!("KS32 atomic pattern not supported for GPU yet");
}
}
},
)
.collect();
let after_packing_list = after_packing_gpu.to_glwe_ciphertext_list(streams);
let after_packing = GlweCiphertext::from_container(
after_packing_list.clone().into_container(),
after_packing_list.polynomial_size(),
after_packing_list.ciphertext_modulus(),
);
let after_packing = DecryptionAndNoiseResult::new_from_glwe(
&after_packing,
noise_squashing_compression_private_key
.key
.post_packing_ks_key(),
lwe_per_glwe,
msg.into(),
&u128_encoding,
);
assert_eq!(after_packing.len(), lwe_per_glwe.0);
(before_packing, after_packing)
}
#[allow(clippy::too_many_arguments)]
#[allow(clippy::type_complexity)]
fn encrypt_dp_ks_standard_pbs128_packing_ks_noise_helper_gpu(
params: AtomicPatternParameters,
noise_squashing_params: NoiseSquashingParameters,
noise_squashing_compression_params: NoiseSquashingCompressionParameters,
single_cks: &crate::integer::ClientKey,
single_cuda_sks: &CudaServerKey,
single_noise_squashing_private_key: &NoiseSquashingPrivateKey,
single_noise_squashing_key: &crate::integer::noise_squashing::NoiseSquashingKey,
single_cuda_noise_squashing_key: &CudaNoiseSquashingKey,
single_noise_squashing_compression_private_key: &NoiseSquashingCompressionPrivateKey,
single_cuda_noise_squashing_compression_key: &CudaNoiseSquashingCompressionKey,
msg: u64,
scalar_for_multiplication: u64,
br_input_modulus_log: CiphertextModulusLog,
streams: &CudaStreams,
) -> (
Vec<(
NoiseSample,
NoiseSample,
NoiseSample,
NoiseSample,
NoiseSample,
NoiseSample,
)>,
Vec<NoiseSample>,
) {
let (before_compression, after_compression) =
encrypt_dp_ks_standard_pbs128_packing_ks_inner_helper_gpu(
params,
noise_squashing_params,
noise_squashing_compression_params,
single_cks,
single_cuda_sks,
single_noise_squashing_private_key,
single_noise_squashing_key,
single_cuda_noise_squashing_key,
single_noise_squashing_compression_private_key,
single_cuda_noise_squashing_compression_key,
msg,
scalar_for_multiplication,
br_input_modulus_log,
streams,
);
(
before_compression
.into_iter()
.map(
|(input, after_dp, after_ks, after_drift, after_ms, after_pbs)| {
(
input
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
after_dp
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
after_ks
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
after_drift
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
after_ms
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
after_pbs
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
)
},
)
.collect(),
after_compression
.into_iter()
.map(|after_compression| {
after_compression
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed")
})
.collect(),
)
}
fn noise_check_encrypt_dp_ks_standard_pbs128_packing_ks_noise_gpu(meta_params: MetaParameters) {
let (atomic_params, noise_squashing_params, noise_squashing_compression_params) = {
let meta_noise_squashing_params = meta_params.noise_squashing_parameters.unwrap();
(
meta_params.compute_parameters,
meta_noise_squashing_params.parameters,
meta_noise_squashing_params.compression_parameters.unwrap(),
)
};
let gpu_index = 0;
let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let block_params: ShortintParameterSet = atomic_params.into();
let cks = crate::integer::ClientKey::new(block_params);
let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_params);
let compressed_noise_squashing_compression_key =
cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
let noise_squashing_key = compressed_noise_squashing_compression_key.decompress();
let cuda_noise_squashing_key =
compressed_noise_squashing_compression_key.decompress_to_cuda(&streams);
let noise_squashing_compression_private_key =
NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_params);
let noise_squashing_compression_key = noise_squashing_private_key
.new_noise_squashing_compression_key(&noise_squashing_compression_private_key);
let cuda_noise_squashing_compression_key =
CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
&noise_squashing_compression_key,
&streams,
);
let noise_simulation_ksk =
NoiseSimulationLweKeyswitchKey::new_from_atomic_pattern_parameters(atomic_params);
let noise_simulation_bsk =
NoiseSimulationLweFourierBsk::new_from_atomic_pattern_parameters(atomic_params);
let noise_simulation_modulus_switch_config =
NoiseSimulationModulusSwitchConfig::new_from_atomic_pattern_parameters(atomic_params);
let noise_simulation_bsk128 =
NoiseSimulationLweFourier128Bsk::new_from_parameters(atomic_params, noise_squashing_params);
let noise_simulation_packing_key =
NoiseSimulationLwePackingKeyswitchKey::new_from_noise_squashing_parameters(
noise_squashing_params,
noise_squashing_compression_params,
);
assert!(noise_simulation_bsk.matches_actual_bsk_gpu(&cuda_sks.bootstrapping_key));
assert!(noise_simulation_bsk128
.matches_actual_shortint_noise_squashing_key(&noise_squashing_key.key));
assert!(noise_simulation_packing_key.matches_actual_pksk(
noise_squashing_compression_key
.key
.packing_key_switching_key()
));
let br_input_modulus_log = noise_squashing_key.key.br_input_modulus_log();
let max_scalar_mul = cuda_sks.max_noise_level.get();
let noise_simulation_accumulator = NoiseSimulationGlwe::new(
noise_simulation_bsk128
.output_glwe_size()
.to_glwe_dimension(),
noise_simulation_bsk128.output_polynomial_size(),
Variance(0.0),
noise_simulation_bsk128.modulus(),
);
let (_before_packing_sim, after_packing_sim) = {
let noise_simulation = NoiseSimulationLwe::encrypt(&cks.key, 0);
dp_ks_any_ms_standard_pbs128_packing_ks(
vec![noise_simulation; cuda_noise_squashing_compression_key.lwe_per_glwe.0],
max_scalar_mul,
&noise_simulation_ksk,
noise_simulation_modulus_switch_config.as_ref(),
&noise_simulation_bsk128,
br_input_modulus_log,
&noise_simulation_accumulator,
&noise_simulation_packing_key,
&mut vec![(); cuda_noise_squashing_compression_key.lwe_per_glwe.0],
)
};
let after_packing_sim = after_packing_sim.into_lwe();
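// Hedged note: the packed GLWE simulation is flattened to its equivalent LWE
// view (dimension = glwe_dimension * polynomial_size) so it can be compared
// against the packing keyswitch key's output dimension below.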
// Check that the circuit is consistent with the core implementation, i.e. that
// it does not crash on dimension checks
let (expected_lwe_dimension_out, expected_modulus_f64_out) = {
let pksk = noise_squashing_compression_key
.key
.packing_key_switching_key();
let out_glwe_dim = pksk.output_key_glwe_dimension();
let out_poly_size = pksk.output_key_polynomial_size();
(
out_glwe_dim.to_equivalent_lwe_dimension(out_poly_size),
pksk.ciphertext_modulus().raw_modulus_float(),
)
};
assert_eq!(
after_packing_sim.lwe_dimension(),
expected_lwe_dimension_out
);
assert_eq!(
after_packing_sim.modulus().as_f64(),
expected_modulus_f64_out
);
let cleartext_modulus = atomic_params.message_modulus().0 * atomic_params.carry_modulus().0;
let mut noise_samples_after_packing = vec![];
let sample_count_per_msg =
1000usize.div_ceil(cuda_noise_squashing_compression_key.lwe_per_glwe.0);
let chunk_size = 4;
let vec_local_streams = (0..chunk_size)
.map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)))
.collect::<Vec<_>>();
for _i in 0..cleartext_modulus {
let current_noise_samples_after_packing: Vec<_> = (0..sample_count_per_msg)
.collect::<Vec<_>>()
.chunks(chunk_size)
.flat_map(|chunk| {
chunk
.into_par_iter()
.map(|i| {
let local_stream = &vec_local_streams[*i % chunk_size];
let (_before_packing, after_packing) =
encrypt_dp_ks_standard_pbs128_packing_ks_noise_helper_gpu(
atomic_params,
noise_squashing_params,
noise_squashing_compression_params,
&cks,
&cuda_sks,
&noise_squashing_private_key,
&noise_squashing_key,
&cuda_noise_squashing_key,
&noise_squashing_compression_private_key,
&cuda_noise_squashing_compression_key,
0,
max_scalar_mul,
br_input_modulus_log,
local_stream,
);
after_packing
})
.collect::<Vec<_>>()
})
.collect();
noise_samples_after_packing.extend(current_noise_samples_after_packing);
}
let noise_samples_after_packing_flattened: Vec<_> = noise_samples_after_packing
.into_iter()
.flatten()
.map(|x| x.value)
.collect();
let after_packing_is_ok = mean_and_variance_check(
&noise_samples_after_packing_flattened,
"after_packing",
0.0,
after_packing_sim.variance(),
noise_squashing_compression_params.packing_ks_key_noise_distribution,
after_packing_sim.lwe_dimension(),
after_packing_sim.modulus().as_f64(),
);
assert!(after_packing_is_ok);
}
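// A minimal sketch of the chunked round-robin stream dispatch used in the test
// above, with the real noise helper replaced by a hypothetical `sample_noise_on`
// stand-in; one CudaStreams instance per rayon worker slot, indexed modulo the
// chunk size, exactly as in the sampling loop.
fn dispatch_round_robin(vec_local_streams: &[CudaStreams], sample_count: usize) -> Vec<f64> {
    use rayon::prelude::*;
    let chunk_size = vec_local_streams.len();
    (0..sample_count)
        .collect::<Vec<_>>()
        .chunks(chunk_size)
        .flat_map(|chunk| {
            chunk
                .into_par_iter()
                // Each worker slot in a chunk reuses the same stream across chunks
                .map(|i| sample_noise_on(&vec_local_streams[*i % chunk_size]))
                .collect::<Vec<_>>()
        })
        .collect()
}
// Stand-in for the real GPU sampling helper, only here to make the sketch
// self-contained.
fn sample_noise_on(_stream: &CudaStreams) -> f64 {
    0.0
}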
create_gpu_parameterized_test!(
noise_check_encrypt_dp_ks_standard_pbs128_packing_ks_noise_gpu {
TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
}
);
create_gpu_parameterized_test!(sanity_check_encrypt_dp_ks_standard_pbs128_packing_ks_gpu {
TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
});
create_gpu_parameterized_test!(sanity_check_encrypt_dp_ks_standard_pbs128_gpu {
TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
});

View File

@@ -1,3 +1,5 @@
pub mod br_dp_ks_ms;
pub mod br_dp_packingks_ms;
pub mod dp_ks_ms;
pub mod dp_ks_pbs_128_packingks;
pub mod utils;

View File

@@ -1,7 +1,7 @@
use crate::core_crypto::commons::noise_formulas::noise_simulation::traits::{
AllocateCenteredBinaryShiftedStandardModSwitchResult,
AllocateDriftTechniqueStandardModSwitchResult, AllocateLweBootstrapResult,
AllocateLweKeyswitchResult, AllocateStandardModSwitchResult,
AllocateLweKeyswitchResult, AllocateLwePackingKeyswitchResult, AllocateStandardModSwitchResult,
CenteredBinaryShiftedStandardModSwitch, DriftTechniqueStandardModSwitch,
LweClassicFftBootstrap, LweKeyswitch, ScalarMul, StandardModSwitch,
};
@@ -13,6 +13,7 @@ use crate::core_crypto::gpu::cuda_modulus_switch_ciphertext;
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_bootstrap_key::CudaModulusSwitchNoiseReductionConfiguration;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey;
use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::prelude::*;
use crate::integer::gpu::ciphertext::info::CudaBlockInfo;
@@ -25,7 +26,7 @@ use crate::integer::gpu::{
cuda_centered_modulus_switch_64, unchecked_small_scalar_mul_integer_async, CudaStreams,
};
use crate::shortint::server_key::tests::noise_distribution::utils::noise_simulation::NoiseSimulationModulusSwitchConfig;
use crate::shortint::server_key::tests::noise_distribution::utils::traits::LwePackingKeyswitch;
/// Side resources for CUDA operations in noise simulation
#[derive(Clone)]
pub struct CudaSideResources {
@@ -128,6 +129,19 @@ impl CudaDynLwe {
}
}
pub fn as_ct_128_cpu(&self, streams: &CudaStreams) -> LweCiphertext<Vec<u128>> {
match self {
Self::U32(_) => panic!("Tried getting a u32 CudaLweCiphertextList as u128."),
Self::U64(_) => panic!("Tried getting a u64 CudaLweCiphertextList as u128."),
Self::U128(_cuda_lwe) => {
let cpu_lwe_list = self.as_lwe_128().to_lwe_ciphertext_list(streams);
let ciphertext_modulus = cpu_lwe_list.ciphertext_modulus();
// Reuse the downloaded container directly instead of cloning it
LweCiphertext::from_container(cpu_lwe_list.into_container(), ciphertext_modulus)
}
}
}
pub fn from_lwe_32(cuda_lwe: CudaLweCiphertextList<u32>) -> Self {
Self::U32(cuda_lwe)
}
@@ -141,6 +155,19 @@ impl CudaDynLwe {
}
}
/// Converts a CudaGlweCiphertextList<u64> to a GlweCiphertext<Vec<u64>>
pub fn cuda_glwe_list_to_glwe_ciphertext(
cuda_glwe_list: &CudaGlweCiphertextList<u64>,
streams: &CudaStreams,
) -> GlweCiphertext<Vec<u64>> {
let cpu_glwe_list = cuda_glwe_list.to_glwe_ciphertext_list(streams);
let polynomial_size = cpu_glwe_list.polynomial_size();
let ciphertext_modulus = cpu_glwe_list.ciphertext_modulus();
// Reuse the downloaded container directly instead of cloning it
GlweCiphertext::from_container(
cpu_glwe_list.into_container(),
polynomial_size,
ciphertext_modulus,
)
}
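// Minimal usage sketch for the helper above, assuming a populated
// `cuda_glwe_list: CudaGlweCiphertextList<u64>` and `streams: CudaStreams`
// are in scope; the round trip preserves the polynomial size and modulus.
//
// let cpu_glwe = cuda_glwe_list_to_glwe_ciphertext(&cuda_glwe_list, &streams);
// assert_eq!(cpu_glwe.polynomial_size(), cuda_glwe_list.polynomial_size());
// assert_eq!(cpu_glwe.ciphertext_modulus(), cuda_glwe_list.ciphertext_modulus());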
impl ScalarMul<u64> for CudaDynLwe {
type Output = Self;
type SideResources = CudaSideResources;
@@ -313,13 +340,14 @@ impl StandardModSwitch<Self> for CudaDynLwe {
panic!("U32 modulus switch not implemented for CudaDynLwe - only U64 is supported");
}
(Self::U64(input), Self::U64(output_cuda_lwe)) => {
let internal_output = input.duplicate(&side_resources.streams);
let mut internal_output = input.duplicate(&side_resources.streams);
cuda_modulus_switch_ciphertext(
&mut output_cuda_lwe.0.d_vec,
&mut internal_output.0.d_vec,
output_modulus_log.0 as u32,
&side_resources.streams,
);
let mut cpu_lwe = internal_output.to_lwe_ciphertext_list(&side_resources.streams);
let shift_to_map_to_native = u64::BITS - output_modulus_log.0 as u32;
for val in cpu_lwe.as_mut_view().into_container().iter_mut() {
*val <<= shift_to_map_to_native;
@@ -713,3 +741,193 @@ impl AllocateLweBootstrapResult for CudaGlweCiphertextList<u128> {
CudaDynLwe::U128(cuda_lwe)
}
}
// Implement LweClassicFft128Bootstrap for CudaNoiseSquashingKey using 128-bit PBS CUDA function
impl
crate::core_crypto::commons::noise_formulas::noise_simulation::traits::LweClassicFft128Bootstrap<
CudaDynLwe,
CudaDynLwe,
CudaGlweCiphertextList<u128>,
> for crate::integer::gpu::noise_squashing::keys::CudaNoiseSquashingKey
{
type SideResources = CudaSideResources;
fn lwe_classic_fft_128_pbs(
&self,
input: &CudaDynLwe,
output: &mut CudaDynLwe,
accumulator: &CudaGlweCiphertextList<u128>,
side_resources: &mut Self::SideResources,
) {
use crate::core_crypto::gpu::algorithms::lwe_programmable_bootstrapping::cuda_programmable_bootstrap_128_lwe_ciphertext_async;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
match (input, output) {
(CudaDynLwe::U64(input_cuda_lwe), CudaDynLwe::U128(output_cuda_lwe)) => {
// Get the bootstrapping key from self; it is already the u128 variant
let bsk = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => d_bsk,
CudaBootstrappingKey::MultiBit(_) => {
panic!("MultiBit bootstrapping keys are not supported for 128-bit PBS");
}
};
unsafe {
cuda_programmable_bootstrap_128_lwe_ciphertext_async(
input_cuda_lwe,
output_cuda_lwe,
accumulator,
bsk,
&side_resources.streams,
);
side_resources.streams.synchronize();
}
}
_ => panic!("128-bit PBS expects U64 input and U128 output for CudaDynLwe"),
}
}
}
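// Minimal call sketch for the impl above, assuming `nsk: CudaNoiseSquashingKey`,
// a u64 LWE list `input_cuda_lwe`, a u128 `accumulator` and `side_resources` are
// in scope; the output allocation goes through the AllocateLweBootstrapResult
// impl shown earlier (method name assumed from the trait name).
//
// let input = CudaDynLwe::U64(input_cuda_lwe);
// let mut output = accumulator.allocate_lwe_bootstrap_result(&mut side_resources);
// nsk.lwe_classic_fft_128_pbs(&input, &mut output, &accumulator, &mut side_resources);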
impl AllocateLwePackingKeyswitchResult for CudaLwePackingKeyswitchKey<u64> {
type Output = CudaGlweCiphertextList<u64>;
type SideResources = CudaSideResources;
fn allocate_lwe_packing_keyswitch_result(
&self,
side_resources: &mut Self::SideResources,
) -> Self::Output {
let glwe_dimension = self.output_glwe_size().to_glwe_dimension();
let polynomial_size = self.output_polynomial_size();
let ciphertext_modulus = self.ciphertext_modulus();
CudaGlweCiphertextList::new(
glwe_dimension,
polynomial_size,
GlweCiphertextCount(1),
ciphertext_modulus,
&side_resources.streams,
)
}
}
impl LwePackingKeyswitch<[&CudaDynLwe], CudaGlweCiphertextList<u64>>
for CudaLwePackingKeyswitchKey<u64>
{
type SideResources = CudaSideResources;
fn keyswitch_lwes_and_pack_in_glwe(
&self,
input: &[&CudaDynLwe],
output: &mut CudaGlweCiphertextList<u64>,
side_resources: &mut CudaSideResources,
) {
use crate::core_crypto::gpu::algorithms::lwe_packing_keyswitch::cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64;
let input_lwe_ciphertext_list = CudaLweCiphertextList::from_vec_cuda_lwe_ciphertexts_list(
input.iter().map(|ciphertext| ciphertext.as_lwe_64()),
&side_resources.streams,
);
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64(
self,
&input_lwe_ciphertext_list,
output,
&side_resources.streams,
);
}
}
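// Minimal sketch chaining the two impls above, assuming `pksk:
// CudaLwePackingKeyswitchKey<u64>`, a `lwes: Vec<&CudaDynLwe>` holding U64
// ciphertexts and `side_resources: CudaSideResources` are in scope:
//
// let mut packed = pksk.allocate_lwe_packing_keyswitch_result(&mut side_resources);
// pksk.keyswitch_lwes_and_pack_in_glwe(&lwes, &mut packed, &mut side_resources);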
// Implement StandardModSwitch traits for CudaGlweCiphertextList<u64>
impl AllocateStandardModSwitchResult for CudaGlweCiphertextList<u64> {
type Output = Self;
type SideResources = CudaSideResources;
fn allocate_standard_mod_switch_result(
&self,
side_resources: &mut Self::SideResources,
) -> Self::Output {
Self::new(
self.glwe_dimension(),
self.polynomial_size(),
self.glwe_ciphertext_count(),
self.ciphertext_modulus(),
&side_resources.streams,
)
}
}
impl StandardModSwitch<Self> for CudaGlweCiphertextList<u64> {
type SideResources = CudaSideResources;
fn standard_mod_switch(
&self,
storage_log_modulus: CiphertextModulusLog,
output: &mut Self,
side_resources: &mut CudaSideResources,
) {
let mut internal_output = self.duplicate(&side_resources.streams);
cuda_modulus_switch_ciphertext(
&mut internal_output.0.d_vec,
storage_log_modulus.0 as u32,
&side_resources.streams,
);
side_resources.streams.synchronize();
let mut cpu_glwe = internal_output.to_glwe_ciphertext_list(&side_resources.streams);
let shift_to_map_to_native = u64::BITS - storage_log_modulus.0 as u32;
for val in cpu_glwe.as_mut_view().into_container().iter_mut() {
*val <<= shift_to_map_to_native;
}
let d_after_ms = Self::from_glwe_ciphertext_list(&cpu_glwe, &side_resources.streams);
*output = d_after_ms;
}
}
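// Worked example of the final shift in standard_mod_switch above: the GPU mod
// switch leaves values in the low `storage_log_modulus` bits, and the left
// shift maps them back onto the MSB-aligned native u64 torus. With an
// illustrative 2^19 storage modulus:
//
// let storage_log_modulus = 19u32;
// let switched: u64 = 0b101; // value produced by the GPU mod switch
// let native = switched << (u64::BITS - storage_log_modulus);
// assert_eq!(native, 0b101u64 << 45);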
impl AllocateLwePackingKeyswitchResult for CudaLwePackingKeyswitchKey<u128> {
type Output = CudaGlweCiphertextList<u128>;
type SideResources = CudaSideResources;
fn allocate_lwe_packing_keyswitch_result(
&self,
side_resources: &mut Self::SideResources,
) -> Self::Output {
let glwe_dimension = self.output_glwe_size().to_glwe_dimension();
let polynomial_size = self.output_polynomial_size();
let ciphertext_modulus = self.ciphertext_modulus();
CudaGlweCiphertextList::new(
glwe_dimension,
polynomial_size,
GlweCiphertextCount(1),
ciphertext_modulus,
&side_resources.streams,
)
}
}
impl LwePackingKeyswitch<[&CudaDynLwe], CudaGlweCiphertextList<u128>>
for CudaLwePackingKeyswitchKey<u128>
{
type SideResources = CudaSideResources;
fn keyswitch_lwes_and_pack_in_glwe(
&self,
input: &[&CudaDynLwe],
output: &mut CudaGlweCiphertextList<u128>,
side_resources: &mut CudaSideResources,
) {
use crate::core_crypto::gpu::algorithms::lwe_packing_keyswitch::cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_128;
let input_lwe_ciphertext_list = CudaLweCiphertextList::from_vec_cuda_lwe_ciphertexts_list(
input.iter().map(|ciphertext| ciphertext.as_lwe_128()),
&side_resources.streams,
);
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_128(
self,
&input_lwe_ciphertext_list,
output,
&side_resources.streams,
);
}
}
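// The u128 impls mirror the u64 ones; only the scalar-specific CUDA entry point
// (cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_128) differs. A
// minimal usage sketch, assuming `pksk_128: CudaLwePackingKeyswitchKey<u128>`
// and U128 inputs `lwes_128: Vec<&CudaDynLwe>` are in scope:
//
// let mut packed_128 = pksk_128.allocate_lwe_packing_keyswitch_result(&mut side_resources);
// pksk_128.keyswitch_lwes_and_pack_in_glwe(&lwes_128, &mut packed_128, &mut side_resources);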

View File

@@ -27,7 +27,7 @@ use crate::shortint::server_key::ServerKey;
use rayon::prelude::*;
#[allow(clippy::too_many_arguments)]
fn dp_ks_any_ms_standard_pbs128<
pub fn dp_ks_any_ms_standard_pbs128<
InputCt,
ScalarMulResult,
KsResult,
@@ -111,7 +111,7 @@ where
#[allow(clippy::too_many_arguments)]
#[allow(clippy::type_complexity)]
fn dp_ks_any_ms_standard_pbs128_packing_ks<
pub fn dp_ks_any_ms_standard_pbs128_packing_ks<
InputCt,
ScalarMulResult,
KsResult,