feat(gpu): create noise and pfail tests for pbs + ks + ms

2026-01-09 22:57:59 -05:00 · 2025-09-23 19:46:39 +02:00
parent b7a706a3db
commit 11579bd3d0
19 changed files with 1920 additions and 51 deletions
--- a/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs
@@ -60,20 +60,11 @@ pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
        accumulator.polynomial_size(),
        bsk.polynomial_size(),
    );
-
    assert_eq!(
-        input.ciphertext_modulus(),
        output.ciphertext_modulus(),
-        "Mismatched CiphertextModulus between input ({:?}) and output ({:?})",
-        input.ciphertext_modulus(),
-        output.ciphertext_modulus(),
-    );
-
-    assert_eq!(
-        input.ciphertext_modulus(),
        accumulator.ciphertext_modulus(),
-        "Mismatched CiphertextModulus between input ({:?}) and accumulator ({:?})",
-        input.ciphertext_modulus(),
+        "Mismatched CiphertextModulus between output ({:?}) and accumulator ({:?})",
+        output.ciphertext_modulus(),
        accumulator.ciphertext_modulus(),
    );
    assert_eq!(
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -831,6 +831,19 @@ pub unsafe fn cuda_modulus_switch_ciphertext_async<T: UnsignedInteger>(
    );
 }

+/// Blocking variant of [`cuda_modulus_switch_ciphertext_async`].
+///
+/// Forwards `output_lwe_ciphertext` and `log_modulus` to the async function on `streams`, then
+/// calls [`CudaStreams::synchronize`] before returning.
+///
+/// Note that the argument order differs from the async function, which takes `streams` first.
+pub fn cuda_modulus_switch_ciphertext<Scalar>(
+    output_lwe_ciphertext: &mut CudaVec<Scalar>,
+    log_modulus: u32,
+    streams: &CudaStreams,
+) where
+    Scalar: UnsignedInteger,
+{
+    // NOTE(review): the safety section of the async function is not visible from here;
+    // presumably its only requirement is the synchronization performed right below -- confirm.
+    unsafe {
+        cuda_modulus_switch_ciphertext_async(streams, output_lwe_ciphertext, log_modulus);
+    }
+    streams.synchronize();
+}
+
 /// Addition of a vector of LWE ciphertexts
 ///
 /// # Safety
--- a/tfhe/src/integer/gpu/ciphertext/mod.rs
+++ b/tfhe/src/integer/gpu/ciphertext/mod.rs
@@ -10,12 +10,12 @@ use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
 use crate::core_crypto::gpu::vec::CudaVec;
 use crate::core_crypto::gpu::CudaStreams;
 use crate::core_crypto::prelude::{LweCiphertextList, LweCiphertextOwned};
-use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo};
 use crate::integer::parameters::LweDimension;
 use crate::integer::{IntegerCiphertext, RadixCiphertext, SignedRadixCiphertext};
 use crate::shortint::{Ciphertext, EncryptionKeyChoice};
 use crate::GpuIndex;

+use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo};
 pub use compressed_noise_squashed_ciphertext_list::*;

 pub trait CudaIntegerRadixCiphertext: Sized {
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -10357,3 +10357,41 @@ pub(crate) unsafe fn cuda_backend_cast_to_signed<T: UnsignedInteger, B: Numeric>

    update_noise_degree(output, &cuda_ffi_output);
 }
+
+/// Multiplies `lwe_array` in place by the clear value `small_scalar` by calling
+/// `cuda_small_scalar_multiplication_integer_64_inplace` on `streams`.
+///
+/// # Safety
+///
+/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
+///   is required
+pub unsafe fn unchecked_small_scalar_mul_integer_async(
+    streams: &CudaStreams,
+    lwe_array: &mut CudaRadixCiphertext,
+    small_scalar: u64,
+    message_modulus: MessageModulus,
+    carry_modulus: CarryModulus,
+) {
+    // The block data must be on the first GPU referenced by `streams`
+    assert_eq!(
+        streams.gpu_indexes[0],
+        lwe_array.d_blocks.0.d_vec.gpu_index(0),
+        "GPU error: all data should reside on the same GPU."
+    );
+    // Degrees and noise levels are copied out of the block metadata into local buffers that
+    // `prepare_cuda_radix_ffi` receives by mutable reference
+    let mut lwe_array_degrees = lwe_array.info.blocks.iter().map(|b| b.degree.0).collect();
+    let mut lwe_array_noise_levels = lwe_array
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.noise_level.0)
+        .collect();
+    let mut cuda_ffi_lwe_array = prepare_cuda_radix_ffi(
+        lwe_array,
+        &mut lwe_array_degrees,
+        &mut lwe_array_noise_levels,
+    );
+
+    cuda_small_scalar_multiplication_integer_64_inplace(
+        streams.ffi(),
+        &raw mut cuda_ffi_lwe_array,
+        small_scalar,
+        message_modulus.0 as u32,
+        carry_modulus.0 as u32,
+    );
+    // NOTE(review): unlike the `update_noise_degree(output, &cuda_ffi_output)` call that ends the
+    // function right above this one, nothing here writes degrees/noise levels back into
+    // `lwe_array.info` after the kernel call -- confirm this is intended.
+}
--- a/tfhe/src/integer/gpu/server_key/radix/mod.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs
@@ -61,6 +61,8 @@ mod aes256;
 #[cfg(test)]
 mod tests_long_run;
 #[cfg(test)]
+mod tests_noise_distribution;
+#[cfg(test)]
 mod tests_signed;
 #[cfg(test)]
 mod tests_unsigned;
--- a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/br_dp_ks_ms.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/br_dp_ks_ms.rs
@@ -0,0 +1,586 @@
+use super::utils::noise_simulation::{CudaDynLwe, CudaSideResources};
+use crate::core_crypto::commons::noise_formulas::noise_simulation::traits::{
+    AllocateLweBootstrapResult, LweClassicFftBootstrap,
+};
+use crate::core_crypto::commons::parameters::CiphertextModulusLog;
+use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
+use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
+use crate::core_crypto::gpu::vec::GpuIndex;
+use crate::core_crypto::gpu::CudaStreams;
+use crate::core_crypto::prelude::LweCiphertext;
+use crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
+use crate::integer::gpu::server_key::radix::tests_unsigned::create_gpu_parameterized_test;
+use crate::integer::gpu::server_key::radix::CudaBlockInfo;
+use crate::integer::gpu::server_key::CudaServerKey;
+use crate::integer::gpu::unchecked_small_scalar_mul_integer_async;
+use crate::integer::{CompressedServerKey, IntegerCiphertext};
+use crate::shortint::ciphertext::NoiseLevel;
+use crate::shortint::client_key::atomic_pattern::AtomicPatternClientKey;
+use crate::shortint::encoding::{PaddingBit, ShortintEncoding};
+
+use crate::shortint::parameters::test_params::{
+    TEST_META_PARAM_CPU_2_2_KS_PBS_GAUSSIAN_2M128,
+    TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
+};
+use crate::shortint::parameters::{AtomicPatternParameters, MetaParameters, Variance};
+use crate::shortint::server_key::tests::noise_distribution::br_dp_ks_ms::br_dp_ks_any_ms;
+use crate::shortint::server_key::tests::noise_distribution::should_use_single_key_debug;
+use crate::shortint::server_key::tests::noise_distribution::utils::noise_simulation::{
+    NoiseSimulationGlwe, NoiseSimulationLwe, NoiseSimulationLweFourierBsk,
+    NoiseSimulationLweKeyswitchKey, NoiseSimulationModulus,
+};
+use crate::shortint::server_key::tests::noise_distribution::utils::{
+    mean_and_variance_check, normality_check, pfail_check, update_ap_params_for_pfail,
+    DecryptionAndNoiseResult, NoiseSample, PfailTestMeta, PfailTestResult,
+};
+
+use crate::shortint::server_key::tests::noise_distribution::should_run_short_pfail_tests_debug;
+use crate::shortint::server_key::tests::noise_distribution::utils::noise_simulation::NoiseSimulationModulusSwitchConfig;
+use crate::shortint::{CarryModulus, Ciphertext, ClientKey, ShortintParameterSet};
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+/// Test function to verify that the noise checking tools match the actual atomic patterns
+/// implemented in shortint for GPU
+///
+/// For 10 fresh inputs, the result of `br_dp_ks_any_ms` followed by `lwe_classic_fft_pbs` is
+/// compared for exact equality with the result of the integer GPU API
+/// (`unchecked_small_scalar_mul_integer_async` followed by `apply_lookup_table`) applied to the
+/// same intermediate PBS output.
+fn sanity_check_encrypt_br_dp_ks_pbs_gpu(meta_params: MetaParameters) {
+    let atomic_params = meta_params.compute_parameters;
+    let gpu_index = 0;
+    let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+
+    let block_params: ShortintParameterSet = atomic_params.into();
+    let cks = crate::integer::ClientKey::new(block_params);
+    let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
+    let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
+
+    let modulus_switch_config = cuda_sks.noise_simulation_modulus_switch_config();
+    let br_input_modulus_log = cuda_sks.br_input_modulus_log();
+
+    let max_scalar_mul = cuda_sks.max_noise_level.get();
+
+    // Identity lookup table, uploaded once and shared by every iteration
+    let id_lut = cuda_sks.generate_lookup_table(|x| x);
+    let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut.acc, &streams);
+
+    let block_info = CudaBlockInfo {
+        degree: crate::shortint::parameters::Degree::new(atomic_params.message_modulus().0 - 1),
+        message_modulus: atomic_params.message_modulus(),
+        carry_modulus: atomic_params.carry_modulus(),
+        atomic_pattern: atomic_params.atomic_pattern(),
+        noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
+    };
+
+    let mut cuda_side_resources = CudaSideResources::new(&streams, block_info);
+
+    for _ in 0..10 {
+        let input_zero_as_lwe = cks
+            .key
+            .encrypt_noiseless_pbs_input_dyn_lwe(br_input_modulus_log, 0);
+
+        let d_ct_input =
+            CudaLweCiphertextList::from_lwe_ciphertext(&input_zero_as_lwe.as_lwe_64(), &streams);
+        let gpu_sample_input = CudaDynLwe::U64(d_ct_input);
+
+        // Reference path: the generic noise-measurement circuit running on the GPU types
+        let (_input, d_input_pbs_result, _after_dp, _ks_result, _drift_technique_result, ms_result) =
+            br_dp_ks_any_ms(
+                gpu_sample_input,
+                &cuda_sks,
+                max_scalar_mul,
+                &cuda_sks,
+                modulus_switch_config,
+                &d_accumulator,
+                br_input_modulus_log,
+                &mut cuda_side_resources,
+            );
+
+        let mut output_pbs_result =
+            d_accumulator.allocate_lwe_bootstrap_result(&mut cuda_side_resources);
+        cuda_sks.lwe_classic_fft_pbs(
+            &ms_result,
+            &mut output_pbs_result,
+            &d_accumulator,
+            &mut cuda_side_resources,
+        );
+
+        let after_pbs_ct = output_pbs_result.as_ct_64_cpu(&cuda_side_resources.streams);
+        let input_pbs_result = d_input_pbs_result.as_ct_64_cpu(&cuda_side_resources.streams);
+
+        // Shortint APIs are not granular enough to compare ciphertexts at the MS level
+        // and inject arbitrary LWEs as input to the blind rotate step of the PBS.
+        // So we start with the output of the input PBS from our test case and finish after
+        // the second PBS and not the MS from our dedicated sanity function, which are
+        // boundaries that are easily reached with shortint.
+        // We don't want to use that dedicated function in statistical tests as it computes
+        // 2 PBSes instead of one, the output of the second PBS being of no interest for
+        // noise measurement here.
+
+        let shortint_res = Ciphertext::new(
+            input_pbs_result,
+            id_lut.degree,
+            NoiseLevel::NOMINAL,
+            cuda_sks.message_modulus,
+            cuda_sks.carry_modulus,
+            atomic_params.atomic_pattern(),
+        );
+
+        // Compared path: same intermediate PBS output pushed through the integer GPU API
+        let radix_ct = crate::integer::RadixCiphertext::from_blocks(vec![shortint_res]);
+        let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
+            &radix_ct,
+            &cuda_side_resources.streams,
+        );
+
+        // SAFETY: the streams are synchronized right after the call, as the function's safety
+        // section requires
+        unsafe {
+            unchecked_small_scalar_mul_integer_async(
+                &cuda_side_resources.streams,
+                &mut d_ct.ciphertext,
+                max_scalar_mul,
+                atomic_params.message_modulus(),
+                atomic_params.carry_modulus(),
+            );
+        }
+        cuda_side_resources.streams.synchronize();
+
+        let mut after_pbs_shortint_ct: CudaUnsignedRadixCiphertext =
+            cuda_sks.create_trivial_zero_radix(1, &cuda_side_resources.streams);
+
+        cuda_sks.apply_lookup_table(
+            &mut after_pbs_shortint_ct.ciphertext,
+            &d_ct.ciphertext,
+            &id_lut,
+            0..1,
+            &cuda_side_resources.streams,
+        );
+
+        let shortint_res_list = after_pbs_shortint_ct
+            .ciphertext
+            .d_blocks
+            .to_lwe_ciphertext_list(&cuda_side_resources.streams);
+
+        // Read the modulus first so the container can be moved out of the list without cloning it
+        let shortint_res_modulus = shortint_res_list.ciphertext_modulus();
+        let shortint_res_ct = LweCiphertext::from_container(
+            shortint_res_list.into_container(),
+            shortint_res_modulus,
+        );
+
+        assert_eq!(after_pbs_ct.as_view(), shortint_res_ct.as_view());
+    }
+}
+
+// Hands the sanity check and the two 2_2 KS_PBS meta parameter sets below to
+// `create_gpu_parameterized_test!` (imported from `tests_unsigned`); presumably one test is
+// generated per parameter set -- confirm against the macro definition
+create_gpu_parameterized_test!(sanity_check_encrypt_br_dp_ks_pbs_gpu {
+    TEST_META_PARAM_CPU_2_2_KS_PBS_GAUSSIAN_2M128,
+    TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
+});
+
+/// Runs one evaluation of `br_dp_ks_any_ms` on the GPU and decrypts every intermediate result.
+///
+/// Unless `should_use_single_key_debug()` returns true, a fresh client key and CUDA server key
+/// are generated for this call; otherwise `single_cks` / `single_cuda_sks` are used.
+///
+/// `msg` is both the value encrypted as the input of the circuit and the value each stage is
+/// checked against at decryption.
+///
+/// Returns, in order, the decryption results for: the input, after BR, after DP, after KS,
+/// before MS (the drift technique output when there is one, otherwise the KS output) and
+/// after MS.
+///
+/// # Panics
+///
+/// Hits `todo!()` for `AtomicPatternClientKey::KeySwitch32` client keys.
+fn encrypt_br_dp_ks_any_ms_inner_helper_gpu(
+    params: AtomicPatternParameters,
+    single_cks: &ClientKey,
+    single_cuda_sks: &CudaServerKey,
+    msg: u64,
+    scalar_for_multiplication: u64,
+    br_input_modulus_log: CiphertextModulusLog,
+    streams: &CudaStreams,
+) -> (
+    DecryptionAndNoiseResult,
+    DecryptionAndNoiseResult,
+    DecryptionAndNoiseResult,
+    DecryptionAndNoiseResult,
+    DecryptionAndNoiseResult,
+    DecryptionAndNoiseResult,
+) {
+    // Declared up front so that the per-call keys outlive the references taken below
+    let thread_cks: crate::integer::ClientKey;
+    let thread_cuda_sks: CudaServerKey;
+
+    let (cks, cuda_sks) = if should_use_single_key_debug() {
+        (single_cks, single_cuda_sks)
+    } else {
+        let block_params: ShortintParameterSet = params.into();
+        thread_cks = crate::integer::ClientKey::new(block_params);
+        let thread_compressed_server_key =
+            CompressedServerKey::new_radix_compressed_server_key(&thread_cks);
+        thread_cuda_sks =
+            CudaServerKey::decompress_from_cpu(&thread_compressed_server_key, streams);
+        (&thread_cks.key, &thread_cuda_sks)
+    };
+    let modulus_switch_config = cuda_sks.noise_simulation_modulus_switch_config();
+    // Encrypt the requested message: the decryption checks below compare against `msg`, so
+    // encrypting a hard-coded 0 would report failures for any non-zero `msg`
+    let ct = cks.encrypt_noiseless_pbs_input_dyn_lwe(br_input_modulus_log, msg);
+
+    let d_ct_lwe = CudaLweCiphertextList::from_lwe_ciphertext(&ct.as_lwe_64(), streams);
+    let d_ct = CudaDynLwe::U64(d_ct_lwe);
+
+    let block_info = CudaBlockInfo {
+        degree: crate::shortint::parameters::Degree::new(params.message_modulus().0 - 1),
+        message_modulus: params.message_modulus(),
+        carry_modulus: params.carry_modulus(),
+        atomic_pattern: params.atomic_pattern(),
+        noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
+    };
+
+    let mut cuda_side_resources = CudaSideResources::new(streams, block_info);
+
+    let id_lut = cuda_sks.generate_lookup_table(|x| x);
+    let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut.acc, streams);
+
+    let (input_gpu, after_br_gpu, after_dp_gpu, after_ks_gpu, after_drift_gpu, after_ms_gpu) =
+        br_dp_ks_any_ms(
+            d_ct,
+            cuda_sks,
+            scalar_for_multiplication,
+            cuda_sks,
+            modulus_switch_config,
+            &d_accumulator,
+            br_input_modulus_log,
+            &mut cuda_side_resources,
+        );
+
+    // Bring every intermediate ciphertext back to the CPU for decryption
+    let input_ct = input_gpu.as_ct_64_cpu(&cuda_side_resources.streams);
+    let after_br_ct = after_br_gpu.as_ct_64_cpu(&cuda_side_resources.streams);
+    let after_dp_ct = after_dp_gpu.as_ct_64_cpu(&cuda_side_resources.streams);
+    let after_ks_ct = after_ks_gpu.as_ct_64_cpu(&cuda_side_resources.streams);
+    // The ciphertext entering the MS is the drift technique output when present, else the KS
+    // output
+    let before_ms_gpu: &CudaDynLwe = after_drift_gpu.as_ref().unwrap_or(&after_ks_gpu);
+    let before_ms_ct = before_ms_gpu.as_ct_64_cpu(&cuda_side_resources.streams);
+    let after_ms_ct = after_ms_gpu.as_ct_64_cpu(&cuda_side_resources.streams);
+
+    let output_encoding = ShortintEncoding::from_parameters(params, PaddingBit::Yes);
+
+    match &cks.atomic_pattern {
+        AtomicPatternClientKey::Standard(standard_atomic_pattern_client_key) => (
+            DecryptionAndNoiseResult::new_from_lwe(
+                &input_ct,
+                &standard_atomic_pattern_client_key.small_lwe_secret_key(),
+                msg,
+                &output_encoding,
+            ),
+            // After BR and after DP are decrypted with the large key, every other stage with the
+            // small key
+            DecryptionAndNoiseResult::new_from_lwe(
+                &after_br_ct,
+                &standard_atomic_pattern_client_key.large_lwe_secret_key(),
+                msg,
+                &output_encoding,
+            ),
+            DecryptionAndNoiseResult::new_from_lwe(
+                &after_dp_ct,
+                &standard_atomic_pattern_client_key.large_lwe_secret_key(),
+                msg,
+                &output_encoding,
+            ),
+            DecryptionAndNoiseResult::new_from_lwe(
+                &after_ks_ct,
+                &standard_atomic_pattern_client_key.small_lwe_secret_key(),
+                msg,
+                &output_encoding,
+            ),
+            DecryptionAndNoiseResult::new_from_lwe(
+                &before_ms_ct,
+                &standard_atomic_pattern_client_key.small_lwe_secret_key(),
+                msg,
+                &output_encoding,
+            ),
+            DecryptionAndNoiseResult::new_from_lwe(
+                &after_ms_ct,
+                &standard_atomic_pattern_client_key.small_lwe_secret_key(),
+                msg,
+                &output_encoding,
+            ),
+        ),
+        AtomicPatternClientKey::KeySwitch32(_) => todo!(),
+    }
+}
+
+/// Runs `encrypt_br_dp_ks_any_ms_inner_helper_gpu` once and extracts the noise sample of each of
+/// the six stages it reports, in the same order.
+///
+/// # Panics
+///
+/// Panics with "Decryption Failed" as soon as one stage has no noise sample because its
+/// decryption was not correct.
+fn encrypt_br_dp_ks_any_ms_noise_helper_gpu(
+    params: AtomicPatternParameters,
+    single_cks: &ClientKey,
+    single_cuda_sks: &CudaServerKey,
+    msg: u64,
+    scalar_for_multiplication: u64,
+    br_input_modulus_log: CiphertextModulusLog,
+    streams: &CudaStreams,
+) -> (
+    NoiseSample,
+    NoiseSample,
+    NoiseSample,
+    NoiseSample,
+    NoiseSample,
+    NoiseSample,
+) {
+    let (input, after_br, after_dp, after_ks, before_ms, after_ms) =
+        encrypt_br_dp_ks_any_ms_inner_helper_gpu(
+            params,
+            single_cks,
+            single_cuda_sks,
+            msg,
+            scalar_for_multiplication,
+            br_input_modulus_log,
+            streams,
+        );
+
+    // Stages are unwrapped front to back, so the first failing stage is the one that panics
+    let [input, after_br, after_dp, after_ks, before_ms, after_ms] =
+        [input, after_br, after_dp, after_ks, before_ms, after_ms].map(|stage_result| {
+            stage_result
+                .get_noise_if_decryption_was_correct()
+                .expect("Decryption Failed")
+        });
+
+    (input, after_br, after_dp, after_ks, before_ms, after_ms)
+}
+
+/// Runs `encrypt_br_dp_ks_any_ms_inner_helper_gpu` once and returns only its last result, the
+/// one measured after the modulus switch; the five earlier stages are discarded.
+fn encrypt_br_dp_ks_any_ms_pfail_helper_gpu(
+    params: AtomicPatternParameters,
+    single_cks: &ClientKey,
+    single_cuda_sks: &CudaServerKey,
+    msg: u64,
+    scalar_for_multiplication: u64,
+    br_input_modulus_log: CiphertextModulusLog,
+    streams: &CudaStreams,
+) -> DecryptionAndNoiseResult {
+    let all_stage_results = encrypt_br_dp_ks_any_ms_inner_helper_gpu(
+        params,
+        single_cks,
+        single_cuda_sks,
+        msg,
+        scalar_for_multiplication,
+        br_input_modulus_log,
+        streams,
+    );
+
+    // Sixth tuple element == result after the modulus switch
+    all_stage_results.5
+}
+
+/// Statistical noise test of the `br_dp_ks_any_ms` circuit on GPU.
+///
+/// The circuit is first evaluated on noise-simulation objects to get the expected output
+/// variance, LWE dimension and modulus, then once on the GPU to check that dimension and modulus
+/// agree. After that, `sample_count_per_msg` noise samples are measured on the GPU for each
+/// iteration of `0..cleartext_modulus`; the test asserts that the samples taken before the
+/// modulus switch pass `normality_check` and that the samples taken after it pass
+/// `mean_and_variance_check`.
+fn noise_check_encrypt_br_dp_ks_ms_noise(params: MetaParameters) {
+    let params: AtomicPatternParameters = params.compute_parameters;
+
+    let noise_simulation_ksk =
+        NoiseSimulationLweKeyswitchKey::new_from_atomic_pattern_parameters(params);
+    let noise_simulation_modulus_switch_config =
+        NoiseSimulationModulusSwitchConfig::new_from_atomic_pattern_parameters(params);
+    let noise_simulation_bsk =
+        NoiseSimulationLweFourierBsk::new_from_atomic_pattern_parameters(params);
+    let gpu_index = 0;
+    let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+
+    let block_params: ShortintParameterSet = params.into();
+    let cks = crate::integer::ClientKey::new(block_params);
+    let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
+    let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
+
+    let modulus_switch_config_gpu = cuda_sks.noise_simulation_modulus_switch_config();
+    let br_input_modulus_log = cuda_sks.br_input_modulus_log();
+    let expected_average_after_ms =
+        modulus_switch_config_gpu.expected_average_after_ms(params.polynomial_size());
+
+    let max_scalar_mul = cuda_sks.max_noise_level.get();
+
+    // Simulated run: only the value after the modulus switch is used further down
+    let (_input_sim, _after_br_sim, _after_dp_sim, _after_ks_sim, _after_drift_sim, after_ms_sim) = {
+        // Noiseless LWE already mod switched is the input of the AP for testing
+        let noise_simulation = NoiseSimulationLwe::new(
+            noise_simulation_bsk.input_lwe_dimension(),
+            Variance(0.0),
+            NoiseSimulationModulus::Other(1 << br_input_modulus_log.0),
+        );
+        let noise_simulation_accumulator = NoiseSimulationGlwe::new(
+            noise_simulation_bsk.output_glwe_size().to_glwe_dimension(),
+            noise_simulation_bsk.output_polynomial_size(),
+            Variance(0.0),
+            noise_simulation_bsk.modulus(),
+        );
+        br_dp_ks_any_ms(
+            noise_simulation,
+            &noise_simulation_bsk,
+            max_scalar_mul,
+            &noise_simulation_ksk,
+            noise_simulation_modulus_switch_config.as_ref(),
+            &noise_simulation_accumulator,
+            br_input_modulus_log,
+            &mut (),
+        )
+    };
+
+    let id_lut = cuda_sks.generate_lookup_table(|x| x);
+    let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut.acc, &streams);
+    let sample_input = cks
+        .key
+        .encrypt_noiseless_pbs_input_dyn_lwe(br_input_modulus_log, 0);
+
+    let d_ct_input =
+        CudaLweCiphertextList::from_lwe_ciphertext(&sample_input.as_lwe_64(), &streams);
+    let gpu_sample_input = CudaDynLwe::U64(d_ct_input);
+
+    let block_info = CudaBlockInfo {
+        degree: crate::shortint::parameters::Degree::new(params.message_modulus().0 - 1),
+        message_modulus: params.message_modulus(),
+        carry_modulus: params.carry_modulus(),
+        atomic_pattern: params.atomic_pattern(),
+        noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
+    };
+    let mut side_resources = CudaSideResources::new(&streams, block_info);
+
+    // Check that the circuit is correct with respect to core implementation, i.e. does not crash on
+    // dimension checks
+    let (expected_lwe_dimension_out, expected_modulus_f64_out) = {
+        let (_input, _after_br, _after_dp, _after_ks, _before_ms, after_ms) = br_dp_ks_any_ms(
+            gpu_sample_input,
+            &cuda_sks,
+            max_scalar_mul,
+            &cuda_sks,
+            modulus_switch_config_gpu,
+            &d_accumulator,
+            br_input_modulus_log,
+            &mut side_resources,
+        );
+
+        (after_ms.lwe_dimension(), after_ms.raw_modulus_float())
+    };
+
+    // The simulation and the real GPU run must agree on the shape of the output
+    assert_eq!(after_ms_sim.lwe_dimension(), expected_lwe_dimension_out);
+    assert_eq!(after_ms_sim.modulus().as_f64(), expected_modulus_f64_out);
+
+    let cleartext_modulus = params.message_modulus().0 * params.carry_modulus().0;
+    let mut noise_samples_before_ms = vec![];
+    let mut noise_samples_after_ms = vec![];
+
+    let sample_count_per_msg = 1000usize;
+
+    // One set of streams per slot of a chunk: sample indices are processed `chunk_size` at a
+    // time and each index picks its streams with `i % chunk_size`
+    let chunk_size = 8;
+    let vec_local_streams = (0..chunk_size)
+        .map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)))
+        .collect::<Vec<_>>();
+    // The loop variable is unused: every iteration measures samples for the message 0
+    for _ in 0..cleartext_modulus {
+        let (current_noise_sample_before_ms, current_noise_samples_after_ms): (Vec<_>, Vec<_>) = (0
+            ..sample_count_per_msg)
+            .collect::<Vec<_>>()
+            .chunks(chunk_size)
+            .flat_map(|chunk| {
+                // Chunks are visited one after the other, the samples of a chunk in parallel
+                chunk
+                    .into_par_iter()
+                    .map(|i| {
+                        let local_stream = &vec_local_streams[*i % chunk_size];
+                        let (_input, _after_br, _after_dp, _after_ks, before_ms, after_ms) =
+                            encrypt_br_dp_ks_any_ms_noise_helper_gpu(
+                                params,
+                                &cks.key,
+                                &cuda_sks,
+                                0,
+                                max_scalar_mul,
+                                br_input_modulus_log,
+                                local_stream,
+                            );
+                        (before_ms.value, after_ms.value)
+                    })
+                    .collect::<Vec<_>>()
+            })
+            .unzip();
+
+        noise_samples_before_ms.extend(current_noise_sample_before_ms);
+        noise_samples_after_ms.extend(current_noise_samples_after_ms);
+    }
+    let before_ms_normality = normality_check(&noise_samples_before_ms, "before ms", 0.01);
+
+    // Measured samples after the MS are checked against the simulated variance and the expected
+    // average computed above
+    let after_ms_is_ok = mean_and_variance_check(
+        &noise_samples_after_ms,
+        "after_ms",
+        expected_average_after_ms,
+        after_ms_sim.variance(),
+        params.lwe_noise_distribution(),
+        after_ms_sim.lwe_dimension(),
+        after_ms_sim.modulus().as_f64(),
+    );
+
+    assert!(before_ms_normality.null_hypothesis_is_valid && after_ms_is_ok);
+}
+
+// Hands the noise check and the two 2_2 KS_PBS meta parameter sets below to
+// `create_gpu_parameterized_test!` (imported from `tests_unsigned`); presumably one test is
+// generated per parameter set -- confirm against the macro definition
+create_gpu_parameterized_test!(noise_check_encrypt_br_dp_ks_ms_noise {
+    TEST_META_PARAM_CPU_2_2_KS_PBS_GAUSSIAN_2M128,
+    TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
+});
+
+/// Failure-probability test of the `br_dp_ks_any_ms` circuit on GPU.
+///
+/// The compute parameters are modified through `update_ap_params_for_pfail`, the circuit is run
+/// `total_runs_for_expected_fails` times on the GPU, the per-run `failure_as_f64()` values are
+/// summed and the total is handed to `pfail_check` together with the test metadata.
+///
+/// # Panics
+///
+/// Panics if the message modulus or the carry modulus of the input parameters is not 4.
+fn noise_check_encrypt_br_dp_ks_ms_pfail_gpu(meta_params: MetaParameters) {
+    let (pfail_test_meta, params) = {
+        let mut ap_params: AtomicPatternParameters = meta_params.compute_parameters;
+
+        let original_message_modulus = ap_params.message_modulus();
+        let original_carry_modulus = ap_params.carry_modulus();
+
+        // For now only allow 2_2 parameters, and see later for heuristics to use
+        assert_eq!(original_message_modulus.0, 4);
+        assert_eq!(original_carry_modulus.0, 4);
+
+        // Update parameters to fail more frequently by inflating the carry modulus, allows to keep
+        // the max multiplication without risks of message overflow
+        let (original_pfail_and_precision, new_expected_pfail_and_precision) =
+            update_ap_params_for_pfail(
+                &mut ap_params,
+                original_message_modulus,
+                CarryModulus(1 << 5),
+            );
+
+        // The amount of work is driven either by a target number of expected failures (short
+        // debug mode) or by a fixed total number of runs
+        let pfail_test_meta = if should_run_short_pfail_tests_debug() {
+            let expected_fails = 200;
+            PfailTestMeta::new_with_desired_expected_fails(
+                original_pfail_and_precision,
+                new_expected_pfail_and_precision,
+                expected_fails,
+            )
+        } else {
+            let total_runs = 1_000_000;
+            PfailTestMeta::new_with_total_runs(
+                original_pfail_and_precision,
+                new_expected_pfail_and_precision,
+                total_runs,
+            )
+        };
+
+        (pfail_test_meta, ap_params)
+    };
+
+    let gpu_index = 0;
+    let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+
+    // Keys are generated from the modified parameters
+    let block_params: ShortintParameterSet = params.into();
+    let cks = crate::integer::ClientKey::new(block_params);
+    let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
+    let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
+    let max_scalar_mul = cuda_sks.max_noise_level.get();
+    let br_input_modulus_log = cuda_sks.br_input_modulus_log();
+
+    let total_runs_for_expected_fails = pfail_test_meta.total_runs_for_expected_fails();
+    // One set of streams per slot of a chunk: run indices are processed `chunk_size` at a time
+    // and each index picks its streams with `i % chunk_size`
+    let chunk_size = 8;
+    let vec_local_streams = (0..chunk_size)
+        .map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)))
+        .collect::<Vec<_>>();
+    let measured_fails: f64 = (0..total_runs_for_expected_fails)
+        .collect::<Vec<_>>()
+        .chunks(chunk_size)
+        .flat_map(|chunk| {
+            // Chunks are visited one after the other, the runs of a chunk in parallel
+            chunk
+                .into_par_iter()
+                .map(|i| {
+                    let local_streams = &vec_local_streams[*i as usize % chunk_size];
+                    let after_ms_decryption_result = encrypt_br_dp_ks_any_ms_pfail_helper_gpu(
+                        params,
+                        &cks.key,
+                        &cuda_sks,
+                        0,
+                        max_scalar_mul,
+                        br_input_modulus_log,
+                        local_streams,
+                    );
+                    after_ms_decryption_result.failure_as_f64()
+                })
+                .collect::<Vec<_>>()
+        })
+        .sum();
+    let test_result = PfailTestResult { measured_fails };
+
+    pfail_check(&pfail_test_meta, test_result);
+}
+
+// Hands the pfail check and the two 2_2 KS_PBS meta parameter sets below to
+// `create_gpu_parameterized_test!` (imported from `tests_unsigned`); presumably one test is
+// generated per parameter set -- confirm against the macro definition
+create_gpu_parameterized_test!(noise_check_encrypt_br_dp_ks_ms_pfail_gpu {
+    TEST_META_PARAM_CPU_2_2_KS_PBS_GAUSSIAN_2M128,
+    TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
+});
--- a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/dp_ks_ms.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/dp_ks_ms.rs
@@ -0,0 +1,540 @@
+use super::utils::noise_simulation::CudaSideResources;
+use crate::core_crypto::commons::noise_formulas::noise_simulation::traits::{
+    AllocateLweBootstrapResult, LweClassicFftBootstrap,
+};
+use crate::core_crypto::commons::parameters::CiphertextModulusLog;
+use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
+use crate::core_crypto::gpu::vec::GpuIndex;
+use crate::core_crypto::gpu::CudaStreams;
+use crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
+use crate::integer::gpu::server_key::radix::CudaBlockInfo;
+use crate::integer::gpu::server_key::CudaServerKey;
+use crate::integer::IntegerCiphertext;
+use crate::shortint::encoding::{PaddingBit, ShortintEncoding};
+
+use crate::core_crypto::commons::numeric::Numeric;
+use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
+use crate::core_crypto::prelude::LweCiphertext;
+use crate::integer::gpu::server_key::radix::tests_noise_distribution::utils::noise_simulation::CudaDynLwe;
+use crate::integer::gpu::server_key::radix::tests_unsigned::create_gpu_parameterized_test;
+use crate::integer::gpu::{unchecked_small_scalar_mul_integer_async, CastInto, CudaVec};
+use crate::integer::CompressedServerKey;
+use crate::shortint::client_key::atomic_pattern::AtomicPatternClientKey;
+use crate::shortint::parameters::test_params::{
+    TEST_META_PARAM_CPU_2_2_KS_PBS_GAUSSIAN_2M128,
+    TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
+};
+use crate::shortint::parameters::{AtomicPatternParameters, MetaParameters};
+use crate::shortint::server_key::tests::noise_distribution::dp_ks_ms::dp_ks_any_ms;
+use crate::shortint::server_key::tests::noise_distribution::utils::noise_simulation::{
+    NoiseSimulationLwe, NoiseSimulationLweKeyswitchKey, NoiseSimulationModulusSwitchConfig,
+};
+use crate::shortint::server_key::tests::noise_distribution::utils::{
+    mean_and_variance_check, normality_check, pfail_check, update_ap_params_for_pfail,
+    DecryptionAndNoiseResult, NoiseSample, PfailTestMeta, PfailTestResult,
+};
+use crate::shortint::server_key::tests::noise_distribution::{
+    should_run_short_pfail_tests_debug, should_use_single_key_debug,
+};
+use crate::shortint::{ClientKey, ShortintParameterSet};
+use itertools::Itertools;
+use rayon::prelude::*;
+
+/// Test function to verify that the noise checking tools match the actual atomic patterns
+/// implemented in shortint
+///
+/// For ten fresh encryptions of 0, the same input goes through two GPU paths:
+/// - the noise-measurement path: `dp_ks_any_ms` followed by `lwe_classic_fft_pbs` with the
+///   identity accumulator;
+/// - the server key path: `unchecked_small_scalar_mul_integer_async` followed by
+///   `apply_lookup_table` with the same identity lookup table.
+///
+/// # Panics
+///
+/// Panics if the LWE ciphertexts produced by the two paths are not equal.
+fn sanity_check_encrypt_dp_ks_pbs_gpu(meta_params: MetaParameters) {
+    let block_params = meta_params.compute_parameters;
+    let gpu_index = 0;
+    let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+
+    let cks = crate::integer::ClientKey::new(block_params);
+    let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
+    let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
+
+    let max_scalar_mul = cuda_sks.max_noise_level.get();
+    // The identity lookup table is used by both paths: as a GLWE accumulator for the manual
+    // PBS and as-is for `apply_lookup_table`.
+    let id_lut = cuda_sks.generate_lookup_table(|x| x);
+    let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut.acc, &streams);
+
+    let br_input_modulus_log = cuda_sks.br_input_modulus_log();
+    let modulus_switch_config = cuda_sks.noise_simulation_modulus_switch_config();
+
+    // Need to generate the required indexes for the PBS
+    // NOTE(review): the three device index buffers below are filled but never passed to any
+    // call in this function; confirm whether they are still needed.
+    let num_ct_blocks = 1;
+    let mut lut_vector_indexes: Vec<u64> = vec![u64::ZERO; num_ct_blocks];
+    for (i, ind) in lut_vector_indexes.iter_mut().enumerate() {
+        *ind = <usize as CastInto<u64>>::cast_into(i);
+    }
+    let mut d_lut_vector_indexes = unsafe { CudaVec::<u64>::new_async(num_ct_blocks, &streams, 0) };
+    unsafe { d_lut_vector_indexes.copy_from_cpu_async(&lut_vector_indexes, &streams, 0) };
+    let lwe_indexes_usize: Vec<usize> = (0..num_ct_blocks).collect_vec();
+    let lwe_indexes = lwe_indexes_usize
+        .iter()
+        .map(|&x| <usize as CastInto<u64>>::cast_into(x))
+        .collect_vec();
+    let mut d_output_indexes = unsafe { CudaVec::<u64>::new_async(num_ct_blocks, &streams, 0) };
+    let mut d_input_indexes = unsafe { CudaVec::<u64>::new_async(num_ct_blocks, &streams, 0) };
+    unsafe {
+        d_input_indexes.copy_from_cpu_async(&lwe_indexes, &streams, 0);
+        d_output_indexes.copy_from_cpu_async(&lwe_indexes, &streams, 0);
+    }
+    // Wait for the asynchronous allocations and copies issued above.
+    streams.synchronize();
+
+    let block_info = CudaBlockInfo {
+        degree: crate::shortint::parameters::Degree::new(block_params.message_modulus().0 - 1),
+        message_modulus: block_params.message_modulus(),
+        carry_modulus: block_params.carry_modulus(),
+        atomic_pattern: block_params.atomic_pattern(),
+        noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
+    };
+
+    let mut cuda_side_resources = CudaSideResources::new(&streams, block_info);
+
+    for _ in 0..10 {
+        // One CPU encryption, kept twice so that both paths start from the same ciphertext.
+        let ct_input = cks.key.encrypt(0);
+        let cloned_ct_input = ct_input.clone();
+        let radix_ct_input = crate::integer::RadixCiphertext::from_blocks(vec![ct_input]);
+        let d_ct_input = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
+            &radix_ct_input,
+            &cuda_side_resources.streams,
+        );
+        let gpu_sample_input = CudaDynLwe::U64(d_ct_input.ciphertext.d_blocks);
+
+        // Path 1: noise-measurement tooling, then a manual PBS on its output.
+        let (_input, _after_dp, _after_ks, _before_ms, after_ms) = dp_ks_any_ms(
+            gpu_sample_input,
+            max_scalar_mul,
+            &cuda_sks,
+            modulus_switch_config,
+            br_input_modulus_log,
+            &mut cuda_side_resources,
+        );
+
+        let mut after_pbs = d_accumulator.allocate_lwe_bootstrap_result(&mut cuda_side_resources);
+        cuda_sks.lwe_classic_fft_pbs(
+            &after_ms,
+            &mut after_pbs,
+            &d_accumulator,
+            &mut cuda_side_resources,
+        );
+
+        let after_pbs_ct = after_pbs.as_ct_64_cpu(&cuda_side_resources.streams);
+
+        // Path 2: the server key operations on the same input.
+        let radix_ct = crate::integer::RadixCiphertext::from_blocks(vec![cloned_ct_input]);
+        let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
+            &radix_ct,
+            &cuda_side_resources.streams,
+        );
+
+        unsafe {
+            unchecked_small_scalar_mul_integer_async(
+                &cuda_side_resources.streams,
+                &mut d_ct.ciphertext,
+                max_scalar_mul,
+                block_params.message_modulus(),
+                block_params.carry_modulus(),
+            );
+        }
+        cuda_side_resources.streams.synchronize();
+
+        let mut after_pbs_shortint_ct: CudaUnsignedRadixCiphertext =
+            cuda_sks.create_trivial_zero_radix(1, &cuda_side_resources.streams);
+
+        cuda_sks.apply_lookup_table(
+            &mut after_pbs_shortint_ct.ciphertext,
+            &d_ct.ciphertext,
+            &id_lut,
+            0..1,
+            &cuda_side_resources.streams,
+        );
+
+        let shortint_res_list = after_pbs_shortint_ct
+            .ciphertext
+            .d_blocks
+            .to_lwe_ciphertext_list(&cuda_side_resources.streams);
+        // Read the modulus before consuming the list so that its container can be moved
+        // into the ciphertext without cloning the whole list.
+        let shortint_res_modulus = shortint_res_list.ciphertext_modulus();
+        let shortint_res_ct =
+            LweCiphertext::from_container(shortint_res_list.into_container(), shortint_res_modulus);
+        assert_eq!(after_pbs_ct.as_view(), shortint_res_ct.as_view());
+    }
+}
+
+// Registers `sanity_check_encrypt_dp_ks_pbs_gpu` with the two parameter sets below
+// (presumably one generated test per entry; confirm in `create_gpu_parameterized_test!`).
+create_gpu_parameterized_test!(sanity_check_encrypt_dp_ks_pbs_gpu {
+    TEST_META_PARAM_CPU_2_2_KS_PBS_GAUSSIAN_2M128,
+    TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
+});
+
+use crate::shortint::CarryModulus;
+/// Encrypts `msg`, runs `dp_ks_any_ms` on the GPU with `scalar_for_multiplication`, and
+/// decrypts every intermediate ciphertext on the CPU.
+///
+/// Unless `should_use_single_key_debug()` returns `true`, a fresh client key and CUDA server
+/// key are generated from `params` for this call and `single_cks` / `single_cuda_sks` are
+/// not used.
+///
+/// The returned tuple holds, in order, the results for: the input, `after_dp`, `after_ks`,
+/// the ciphertext right before the modulus switch (the drift output of `dp_ks_any_ms` when
+/// there is one, the keyswitch output otherwise) and `after_ms`. The first two are decrypted
+/// with the large LWE secret key, the last three with the small one.
+///
+/// # Panics
+///
+/// Reaches `todo!()` when the client key uses the `KeySwitch32` atomic pattern.
+fn encrypt_dp_ks_any_ms_inner_helper_gpu(
+    params: AtomicPatternParameters,
+    single_cks: &ClientKey,
+    single_cuda_sks: &CudaServerKey,
+    msg: u64,
+    scalar_for_multiplication: u64,
+    br_input_modulus_log: CiphertextModulusLog,
+    streams: &CudaStreams,
+) -> (
+    DecryptionAndNoiseResult,
+    DecryptionAndNoiseResult,
+    DecryptionAndNoiseResult,
+    DecryptionAndNoiseResult,
+    DecryptionAndNoiseResult,
+) {
+    // Keys owned by this call; stays `None` when the shared debug keys must be used.
+    let per_call_keys = (!should_use_single_key_debug()).then(|| {
+        let block_params: ShortintParameterSet = params.into();
+        let fresh_cks = crate::integer::ClientKey::new(block_params);
+        let fresh_compressed_sks = CompressedServerKey::new_radix_compressed_server_key(&fresh_cks);
+        let fresh_cuda_sks = CudaServerKey::decompress_from_cpu(&fresh_compressed_sks, streams);
+        (fresh_cks, fresh_cuda_sks)
+    });
+    let (cks, cuda_sks) = match &per_call_keys {
+        Some((fresh_cks, fresh_cuda_sks)) => (&fresh_cks.key, fresh_cuda_sks),
+        None => (single_cks, single_cuda_sks),
+    };
+    let ms_config = cuda_sks.noise_simulation_modulus_switch_config();
+
+    // Encrypt on the CPU and upload the raw LWE ciphertext.
+    let encrypted_input = cks.unchecked_encrypt(msg);
+    let gpu_sample_input = CudaDynLwe::U64(CudaLweCiphertextList::from_lwe_ciphertext(
+        &encrypted_input.ct,
+        streams,
+    ));
+
+    let mut cuda_side_resources = CudaSideResources::new(
+        streams,
+        CudaBlockInfo {
+            degree: crate::shortint::parameters::Degree::new(params.message_modulus().0 - 1),
+            message_modulus: params.message_modulus(),
+            carry_modulus: params.carry_modulus(),
+            atomic_pattern: params.atomic_pattern(),
+            noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
+        },
+    );
+
+    let (input_gpu, after_dp_gpu, after_ks_gpu, after_drift_gpu, after_ms_gpu) = dp_ks_any_ms(
+        gpu_sample_input,
+        scalar_for_multiplication,
+        cuda_sks,
+        ms_config,
+        br_input_modulus_log,
+        &mut cuda_side_resources,
+    );
+    cuda_side_resources.streams.synchronize();
+
+    // Bring every stage back to the CPU for decryption.
+    let gpu_streams = &cuda_side_resources.streams;
+    let input_ct = input_gpu.as_ct_64_cpu(gpu_streams);
+    let after_dp_ct = after_dp_gpu.as_ct_64_cpu(gpu_streams);
+    let after_ks_ct = after_ks_gpu.as_ct_64_cpu(gpu_streams);
+    let after_ms_ct = after_ms_gpu.as_ct_64_cpu(gpu_streams);
+    // Without a drift output, the keyswitch output is what enters the modulus switch.
+    let before_ms_ct = after_drift_gpu
+        .as_ref()
+        .unwrap_or(&after_ks_gpu)
+        .as_ct_64_cpu(gpu_streams);
+
+    let encoding = ShortintEncoding::from_parameters(params, PaddingBit::Yes);
+
+    match &cks.atomic_pattern {
+        AtomicPatternClientKey::Standard(std_cks) => {
+            let large_key = std_cks.large_lwe_secret_key();
+            let small_key = std_cks.small_lwe_secret_key();
+            (
+                DecryptionAndNoiseResult::new_from_lwe(&input_ct, &large_key, msg, &encoding),
+                DecryptionAndNoiseResult::new_from_lwe(&after_dp_ct, &large_key, msg, &encoding),
+                DecryptionAndNoiseResult::new_from_lwe(&after_ks_ct, &small_key, msg, &encoding),
+                DecryptionAndNoiseResult::new_from_lwe(&before_ms_ct, &small_key, msg, &encoding),
+                DecryptionAndNoiseResult::new_from_lwe(&after_ms_ct, &small_key, msg, &encoding),
+            )
+        }
+        AtomicPatternClientKey::KeySwitch32(_) => todo!(),
+    }
+}
+
+/// Runs `encrypt_dp_ks_any_ms_inner_helper_gpu` and converts each of its five decryption
+/// results into the corresponding noise sample, keeping the same tuple order.
+///
+/// # Panics
+///
+/// Panics with "Decryption Failed" on the first stage (in tuple order) for which
+/// `get_noise_if_decryption_was_correct` returns `None`.
+fn encrypt_dp_ks_any_ms_noise_helper_gpu(
+    params: AtomicPatternParameters,
+    single_cks: &ClientKey,
+    single_cuda_sks: &CudaServerKey,
+    msg: u64,
+    scalar_for_multiplication: u64,
+    br_input_modulus_log: CiphertextModulusLog,
+    streams: &CudaStreams,
+) -> (
+    NoiseSample,
+    NoiseSample,
+    NoiseSample,
+    NoiseSample,
+    NoiseSample,
+) {
+    let decryption_results = encrypt_dp_ks_any_ms_inner_helper_gpu(
+        params,
+        single_cks,
+        single_cuda_sks,
+        msg,
+        scalar_for_multiplication,
+        br_input_modulus_log,
+        streams,
+    );
+    let (input, after_dp, after_ks, before_ms, after_ms) = decryption_results;
+
+    // A noise sample only makes sense when the stage decrypted to the expected message.
+    let noise_or_panic = |result: DecryptionAndNoiseResult| {
+        result
+            .get_noise_if_decryption_was_correct()
+            .expect("Decryption Failed")
+    };
+
+    (
+        noise_or_panic(input),
+        noise_or_panic(after_dp),
+        noise_or_panic(after_ks),
+        noise_or_panic(before_ms),
+        noise_or_panic(after_ms),
+    )
+}
+
+/// Runs `encrypt_dp_ks_any_ms_inner_helper_gpu` and returns only the decryption result
+/// obtained after the modulus switch.
+///
+/// The inner helper returns `(input, after_dp, after_ks, before_ms, after_ms)`; the last
+/// element is the one a failure-probability measurement needs, since it includes the noise
+/// added by the modulus switch.
+fn encrypt_dp_ks_any_ms_pfail_helper_gpu(
+    params: AtomicPatternParameters,
+    single_cks: &ClientKey,
+    single_cuda_sks: &CudaServerKey,
+    msg: u64,
+    scalar_for_multiplication: u64,
+    br_input_modulus_log: CiphertextModulusLog,
+    streams: &CudaStreams,
+) -> DecryptionAndNoiseResult {
+    // The fourth element is the result *before* the modulus switch: it must be skipped,
+    // not returned.
+    let (_input, _after_dp, _after_ks, _before_ms, after_ms) =
+        encrypt_dp_ks_any_ms_inner_helper_gpu(
+            params,
+            single_cks,
+            single_cuda_sks,
+            msg,
+            scalar_for_multiplication,
+            br_input_modulus_log,
+            streams,
+        );
+
+    after_ms
+}
+
+/// GPU version of the noise checking test
+///
+/// Steps:
+/// 1. Propagate a simulated encryption of 0 through `dp_ks_any_ms` to obtain the expected
+///    LWE dimension, modulus and variance after the modulus switch.
+/// 2. Run the same chain once on the GPU and check that the output LWE dimension and
+///    modulus agree with the simulation.
+/// 3. Measure noise on the GPU for encryptions of 0, `chunk_size` samples at a time, each
+///    parallel task of a batch using its own entry of `vec_local_streams`.
+/// 4. Run `normality_check` on the samples taken before the modulus switch and
+///    `mean_and_variance_check` on the samples taken after it.
+///
+/// # Panics
+///
+/// Panics if a dimension/modulus assertion fails, if a sample does not decrypt correctly,
+/// or if one of the two statistical checks fails.
+fn noise_check_encrypt_dp_ks_ms_noise_gpu(params: MetaParameters) {
+    let params = params.compute_parameters;
+    let sim_ksk = NoiseSimulationLweKeyswitchKey::new_from_atomic_pattern_parameters(params);
+    let sim_ms_config =
+        NoiseSimulationModulusSwitchConfig::new_from_atomic_pattern_parameters(params);
+
+    let gpu_index = 0;
+    let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+
+    let block_params: ShortintParameterSet = params.into();
+    let cks = crate::integer::ClientKey::new(block_params);
+    let compressed_sks = CompressedServerKey::new_radix_compressed_server_key(&cks);
+    let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_sks, &streams);
+    let gpu_ms_config = cuda_sks.noise_simulation_modulus_switch_config();
+    let br_input_modulus_log = cuda_sks.br_input_modulus_log();
+    let expected_average_after_ms =
+        gpu_ms_config.expected_average_after_ms(params.polynomial_size());
+
+    let max_scalar_mul = cuda_sks.max_noise_level.get();
+
+    // Simulated run: only the state after the modulus switch is needed.
+    let (_, _, _, _, after_ms_sim) = dp_ks_any_ms(
+        NoiseSimulationLwe::encrypt(&cks.key, 0),
+        max_scalar_mul,
+        &sim_ksk,
+        sim_ms_config.as_ref(),
+        br_input_modulus_log,
+        &mut (),
+    );
+
+    // Check that the circuit is correct with respect to core implementation, i.e. does not crash on
+    // dimension checks
+    let ct_input = cks.key.encrypt(0);
+    let gpu_sample_input = CudaDynLwe::U64(CudaLweCiphertextList::from_lwe_ciphertext(
+        &ct_input.ct,
+        &streams,
+    ));
+    // Block metadata built from the parameters for this single GPU run.
+    let block_info = CudaBlockInfo {
+        degree: crate::shortint::parameters::Degree::new(params.message_modulus().0 - 1),
+        message_modulus: params.message_modulus(),
+        carry_modulus: params.carry_modulus(),
+        atomic_pattern: params.atomic_pattern(),
+        noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
+    };
+    let (expected_lwe_dimension_out, expected_modulus_f64_out) = {
+        let mut side_resources = CudaSideResources::new(&streams, block_info);
+
+        let (_input, _after_dp, _after_ks, _after_drift, after_ms) = dp_ks_any_ms(
+            gpu_sample_input,
+            max_scalar_mul,
+            &cuda_sks,
+            gpu_ms_config,
+            br_input_modulus_log,
+            &mut side_resources,
+        );
+
+        (after_ms.lwe_dimension(), after_ms.raw_modulus_float())
+    };
+
+    assert_eq!(after_ms_sim.lwe_dimension(), expected_lwe_dimension_out);
+    assert_eq!(after_ms_sim.modulus().as_f64(), expected_modulus_f64_out);
+
+    let cleartext_modulus = params.message_modulus().0 * params.carry_modulus().0;
+    let mut noise_samples_before_ms = Vec::new();
+    let mut noise_samples_after_ms = Vec::new();
+
+    let sample_count_per_msg = 1000usize;
+    let chunk_size = 8;
+    // One stream set per parallel slot of a batch.
+    let vec_local_streams: Vec<_> = (0..chunk_size)
+        .map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)))
+        .collect();
+    let sample_ids: Vec<usize> = (0..sample_count_per_msg).collect();
+
+    for _ in 0..cleartext_modulus {
+        // Batches run one after the other; samples inside a batch run in parallel.
+        for batch in sample_ids.chunks(chunk_size) {
+            let batch_samples: Vec<_> = batch
+                .into_par_iter()
+                .map(|sample_id| {
+                    // Ids of a batch are consecutive, so each one maps to a distinct slot.
+                    let local_stream = &vec_local_streams[*sample_id % chunk_size];
+                    let (_input, _after_dp, _after_ks, before_ms, after_ms) =
+                        encrypt_dp_ks_any_ms_noise_helper_gpu(
+                            params,
+                            &cks.key,
+                            &cuda_sks,
+                            0,
+                            max_scalar_mul,
+                            br_input_modulus_log,
+                            local_stream,
+                        );
+                    (before_ms.value, after_ms.value)
+                })
+                .collect();
+
+            for (before_ms_noise, after_ms_noise) in batch_samples {
+                noise_samples_before_ms.push(before_ms_noise);
+                noise_samples_after_ms.push(after_ms_noise);
+            }
+        }
+    }
+
+    let before_ms_normality = normality_check(&noise_samples_before_ms, "before ms", 0.01);
+
+    let after_ms_is_ok = mean_and_variance_check(
+        &noise_samples_after_ms,
+        "after_ms",
+        expected_average_after_ms,
+        after_ms_sim.variance(),
+        params.lwe_noise_distribution(),
+        after_ms_sim.lwe_dimension(),
+        after_ms_sim.modulus().as_f64(),
+    );
+
+    assert!(before_ms_normality.null_hypothesis_is_valid && after_ms_is_ok);
+}
+
+// Registers `noise_check_encrypt_dp_ks_ms_noise_gpu` with the two parameter sets below
+// (presumably one generated test per entry; confirm in `create_gpu_parameterized_test!`).
+create_gpu_parameterized_test!(noise_check_encrypt_dp_ks_ms_noise_gpu {
+    TEST_META_PARAM_CPU_2_2_KS_PBS_GAUSSIAN_2M128,
+    TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
+});
+
+/// GPU failure-probability test for the `dp_ks_any_ms` chain.
+///
+/// The compute parameters are first modified through `update_ap_params_for_pfail` (called
+/// with the original message modulus and a carry modulus of `1 << 5`). Keys are then
+/// generated from the modified parameters, and encryptions of 0 are pushed through
+/// `encrypt_dp_ks_any_ms_pfail_helper_gpu`, `chunk_size` runs at a time. The summed
+/// `failure_as_f64` values are handed to `pfail_check`.
+///
+/// # Panics
+///
+/// Panics if the input parameters do not have a message modulus and a carry modulus of 4.
+/// Any panic raised by `pfail_check` or by the helper propagates as well.
+fn noise_check_encrypt_dp_ks_ms_pfail_gpu(meta_params: MetaParameters) {
+    let mut ap_params = meta_params.compute_parameters;
+    let original_message_modulus = ap_params.message_modulus();
+    let original_carry_modulus = ap_params.carry_modulus();
+
+    // For now only allow 2_2 parameters, and see later for heuristics to use
+    assert_eq!(original_message_modulus.0, 4);
+    assert_eq!(original_carry_modulus.0, 4);
+
+    // Update parameters to fail more frequently by inflating the carry modulus, allows to keep
+    // the max multiplication without risks of message overflow
+    let (original_pfail_and_precision, new_expected_pfail_and_precision) =
+        update_ap_params_for_pfail(
+            &mut ap_params,
+            original_message_modulus,
+            CarryModulus(1 << 5),
+        );
+    // From here on only the updated parameters are used.
+    let params = ap_params;
+
+    let pfail_test_meta = if should_run_short_pfail_tests_debug() {
+        let expected_fails = 200;
+        PfailTestMeta::new_with_desired_expected_fails(
+            original_pfail_and_precision,
+            new_expected_pfail_and_precision,
+            expected_fails,
+        )
+    } else {
+        let total_runs = 1_000_000;
+        PfailTestMeta::new_with_total_runs(
+            original_pfail_and_precision,
+            new_expected_pfail_and_precision,
+            total_runs,
+        )
+    };
+
+    let gpu_index = 0;
+    let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
+
+    let block_params: ShortintParameterSet = params.into();
+    let cks = crate::integer::ClientKey::new(block_params);
+    let compressed_sks = CompressedServerKey::new_radix_compressed_server_key(&cks);
+    let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_sks, &streams);
+    let max_scalar_mul = cuda_sks.max_noise_level.get();
+    let br_input_modulus_log = cuda_sks.br_input_modulus_log();
+    let total_runs_for_expected_fails = pfail_test_meta.total_runs_for_expected_fails();
+
+    let chunk_size = 8;
+    // One stream set per parallel slot of a batch.
+    let vec_local_streams: Vec<_> = (0..chunk_size)
+        .map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)))
+        .collect();
+
+    let run_ids: Vec<_> = (0..total_runs_for_expected_fails).collect();
+    // Batches run one after the other; the runs inside a batch run in parallel.
+    let measured_fails: f64 = run_ids
+        .chunks(chunk_size)
+        .flat_map(|batch| {
+            let batch_failures: Vec<_> = batch
+                .into_par_iter()
+                .map(|run_id| {
+                    // Ids of a batch are consecutive, so each one maps to a distinct slot.
+                    let local_streams = &vec_local_streams[(*run_id as usize) % chunk_size];
+                    encrypt_dp_ks_any_ms_pfail_helper_gpu(
+                        params,
+                        &cks.key,
+                        &cuda_sks,
+                        0,
+                        max_scalar_mul,
+                        br_input_modulus_log,
+                        local_streams,
+                    )
+                    .failure_as_f64()
+                })
+                .collect();
+            batch_failures
+        })
+        .sum();
+
+    pfail_check(&pfail_test_meta, PfailTestResult { measured_fails });
+}
+
+// Registers `noise_check_encrypt_dp_ks_ms_pfail_gpu` with the two parameter sets below
+// (presumably one generated test per entry; confirm in `create_gpu_parameterized_test!`).
+create_gpu_parameterized_test!(noise_check_encrypt_dp_ks_ms_pfail_gpu {
+    TEST_META_PARAM_CPU_2_2_KS_PBS_GAUSSIAN_2M128,
+    TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
+});
--- a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/mod.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/mod.rs
@@ -0,0 +1,3 @@
+pub mod br_dp_ks_ms;
+pub mod dp_ks_ms;
+pub mod utils;
--- a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/mod.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/mod.rs
@@ -0,0 +1 @@
+pub mod noise_simulation;
--- a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/noise_simulation.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/noise_simulation.rs
@@ -0,0 +1,682 @@
+use crate::core_crypto::commons::noise_formulas::noise_simulation::traits::{
+    AllocateCenteredBinaryShiftedStandardModSwitchResult,
+    AllocateDriftTechniqueStandardModSwitchResult, AllocateLweBootstrapResult,
+    AllocateLweKeyswitchResult, AllocateStandardModSwitchResult,
+    CenteredBinaryShiftedStandardModSwitch, DriftTechniqueStandardModSwitch,
+    LweClassicFftBootstrap, LweKeyswitch, ScalarMul, StandardModSwitch,
+};
+use crate::core_crypto::commons::noise_formulas::noise_simulation::{
+    NoiseSimulationLweFourier128Bsk, NoiseSimulationLweFourierBsk,
+};
+use crate::core_crypto::gpu::algorithms::lwe_keyswitch::cuda_keyswitch_lwe_ciphertext;
+use crate::core_crypto::gpu::cuda_modulus_switch_ciphertext;
+use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
+use crate::core_crypto::gpu::lwe_bootstrap_key::CudaModulusSwitchNoiseReductionConfiguration;
+use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
+use crate::core_crypto::gpu::vec::CudaVec;
+use crate::core_crypto::prelude::*;
+use crate::integer::gpu::ciphertext::info::CudaBlockInfo;
+use crate::integer::gpu::ciphertext::CudaRadixCiphertext;
+use crate::integer::gpu::server_key::radix::{CudaNoiseSquashingKey, CudaRadixCiphertextInfo};
+use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
+use crate::integer::gpu::{
+    cuda_centered_modulus_switch_64, unchecked_small_scalar_mul_integer_async, CudaStreams,
+};
+use crate::shortint::server_key::tests::noise_distribution::utils::noise_simulation::NoiseSimulationModulusSwitchConfig;
+
+/// Side resources for CUDA operations in noise simulation
+///
+/// Passed as the `SideResources` of the `ScalarMul` implementation for `CudaDynLwe`.
+#[derive(Clone)]
+pub struct CudaSideResources {
+    /// CUDA streams used for the GPU calls issued through these resources.
+    pub streams: CudaStreams,
+    /// Block metadata; `scalar_mul` reads its message and carry moduli and attaches it to the
+    /// temporary radix ciphertext it builds.
+    pub block_info: CudaBlockInfo,
+}
+
+impl CudaSideResources {
+    /// Builds side resources from a clone of `streams` and the given `block_info`.
+    pub fn new(streams: &CudaStreams, block_info: CudaBlockInfo) -> Self {
+        let streams = streams.clone();
+        Self {
+            streams,
+            block_info,
+        }
+    }
+}
+
+/// GPU version of DynLwe for CUDA operations
+///
+/// Wraps a `CudaLweCiphertextList` whose scalar type is selected at runtime by the variant.
+#[derive(Clone)]
+pub enum CudaDynLwe {
+    /// Ciphertext list over `u32` scalars.
+    U32(CudaLweCiphertextList<u32>),
+    /// Ciphertext list over `u64` scalars.
+    U64(CudaLweCiphertextList<u64>),
+    /// Ciphertext list over `u128` scalars.
+    U128(CudaLweCiphertextList<u128>),
+}
+
+impl CudaDynLwe {
+    /// Returns the LWE dimension of the wrapped list, whatever its scalar type.
+    pub fn lwe_dimension(&self) -> LweDimension {
+        match self {
+            Self::U32(cuda_lwe) => cuda_lwe.lwe_dimension(),
+            Self::U64(cuda_lwe) => cuda_lwe.lwe_dimension(),
+            Self::U128(cuda_lwe) => cuda_lwe.lwe_dimension(),
+        }
+    }
+
+    /// Returns the ciphertext modulus of the wrapped list as an `f64`.
+    pub fn raw_modulus_float(&self) -> f64 {
+        match self {
+            Self::U32(cuda_lwe) => cuda_lwe.ciphertext_modulus().raw_modulus_float(),
+            Self::U64(cuda_lwe) => cuda_lwe.ciphertext_modulus().raw_modulus_float(),
+            Self::U128(cuda_lwe) => cuda_lwe.ciphertext_modulus().raw_modulus_float(),
+        }
+    }
+
+    /// Borrows the wrapped `u32` list.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `self` is not the `U32` variant.
+    pub fn as_lwe_32(&self) -> &CudaLweCiphertextList<u32> {
+        match self {
+            Self::U32(cuda_lwe) => cuda_lwe,
+            Self::U64(_) => panic!("Tried getting a u64 CudaLweCiphertextList as u32."),
+            Self::U128(_) => panic!("Tried getting a u128 CudaLweCiphertextList as u32."),
+        }
+    }
+
+    /// Borrows the wrapped `u64` list.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `self` is not the `U64` variant.
+    pub fn as_lwe_64(&self) -> &CudaLweCiphertextList<u64> {
+        match self {
+            Self::U32(_) => panic!("Tried getting a u32 CudaLweCiphertextList as u64."),
+            Self::U64(cuda_lwe) => cuda_lwe,
+            Self::U128(_) => panic!("Tried getting a u128 CudaLweCiphertextList as u64."),
+        }
+    }
+
+    /// Borrows the wrapped `u128` list.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `self` is not the `U128` variant.
+    pub fn as_lwe_128(&self) -> &CudaLweCiphertextList<u128> {
+        match self {
+            Self::U32(_) => panic!("Tried getting a u32 CudaLweCiphertextList as u128."),
+            Self::U64(_) => panic!("Tried getting a u64 CudaLweCiphertextList as u128."),
+            Self::U128(cuda_lwe) => cuda_lwe,
+        }
+    }
+
+    /// Consumes `self` and returns the wrapped `u32` list.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `self` is not the `U32` variant.
+    pub fn into_lwe_32(self) -> CudaLweCiphertextList<u32> {
+        match self {
+            Self::U32(cuda_lwe) => cuda_lwe,
+            Self::U64(_) => panic!("Tried converting a u64 CudaLweCiphertextList to u32."),
+            Self::U128(_) => panic!("Tried converting a u128 CudaLweCiphertextList to u32."),
+        }
+    }
+
+    /// Consumes `self` and returns the wrapped `u64` list.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `self` is not the `U64` variant.
+    pub fn into_lwe_64(self) -> CudaLweCiphertextList<u64> {
+        match self {
+            Self::U32(_) => panic!("Tried converting a u32 CudaLweCiphertextList to u64."),
+            Self::U64(cuda_lwe) => cuda_lwe,
+            Self::U128(_) => panic!("Tried converting a u128 CudaLweCiphertextList to u64."),
+        }
+    }
+
+    /// Consumes `self` and returns the wrapped `u128` list.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `self` is not the `U128` variant.
+    pub fn into_lwe_128(self) -> CudaLweCiphertextList<u128> {
+        match self {
+            Self::U32(_) => panic!("Tried converting a u32 CudaLweCiphertextList to u128."),
+            Self::U64(_) => panic!("Tried converting a u64 CudaLweCiphertextList to u128."),
+            Self::U128(cuda_lwe) => cuda_lwe,
+        }
+    }
+
+    /// Copies the wrapped `u64` list to the CPU and returns its data as one `LweCiphertext`.
+    ///
+    /// The whole container of the downloaded list becomes the ciphertext container.
+    /// NOTE(review): this presumably expects the list to hold exactly one ciphertext; no
+    /// check is made here.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `self` is not the `U64` variant.
+    pub fn as_ct_64_cpu(&self, streams: &CudaStreams) -> LweCiphertext<Vec<u64>> {
+        match self {
+            Self::U32(_) => panic!("Tried getting a u32 CudaLweCiphertextList as u64."),
+            Self::U64(cuda_lwe) => {
+                let cpu_lwe_list = cuda_lwe.to_lwe_ciphertext_list(streams);
+                // Read the modulus first so the container can be moved out without cloning
+                // the downloaded list.
+                let ciphertext_modulus = cpu_lwe_list.ciphertext_modulus();
+                LweCiphertext::from_container(cpu_lwe_list.into_container(), ciphertext_modulus)
+            }
+            Self::U128(_) => panic!("Tried getting a u128 CudaLweCiphertextList as u64."),
+        }
+    }
+
+    /// Wraps a `u32` list in the `U32` variant.
+    pub fn from_lwe_32(cuda_lwe: CudaLweCiphertextList<u32>) -> Self {
+        Self::U32(cuda_lwe)
+    }
+
+    /// Wraps a `u64` list in the `U64` variant.
+    pub fn from_lwe_64(cuda_lwe: CudaLweCiphertextList<u64>) -> Self {
+        Self::U64(cuda_lwe)
+    }
+
+    /// Wraps a `u128` list in the `U128` variant.
+    pub fn from_lwe_128(cuda_lwe: CudaLweCiphertextList<u128>) -> Self {
+        Self::U128(cuda_lwe)
+    }
+}
+
+impl ScalarMul<u64> for CudaDynLwe {
+    type Output = Self;
+    type SideResources = CudaSideResources;
+
+    /// Multiplies a duplicate of the wrapped `u64` list by `scalar` on the GPU and returns
+    /// the result as a new `U64` value; `self` is left untouched.
+    ///
+    /// The message and carry moduli come from `side_resources.block_info`, and the streams
+    /// are synchronized before returning.
+    ///
+    /// # Panics
+    ///
+    /// Panics for the `U32` and `U128` variants, which are not supported.
+    fn scalar_mul(&self, scalar: u64, side_resources: &mut Self::SideResources) -> Self::Output {
+        let cuda_lwe = match self {
+            Self::U64(cuda_lwe) => cuda_lwe,
+            Self::U32(_) => {
+                panic!("U32 scalar mul not implemented for CudaDynLwe - only U64 is supported")
+            }
+            Self::U128(_) => {
+                panic!("U128 scalar mul not implemented for CudaDynLwe - only U64 is supported")
+            }
+        };
+
+        let streams = &side_resources.streams;
+        // Use the block info from side_resources for proper modulus values
+        let block_info = side_resources.block_info;
+
+        // Work on a copy so that the input ciphertext is not modified.
+        let mut scaled = CudaRadixCiphertext::new(
+            cuda_lwe.duplicate(streams),
+            CudaRadixCiphertextInfo {
+                blocks: vec![block_info],
+            },
+        );
+        // SAFETY: presumably the async contract only requires the streams to be synchronized
+        // before the result is used, which happens right below (confirm against the callee).
+        unsafe {
+            unchecked_small_scalar_mul_integer_async(
+                streams,
+                &mut scaled,
+                scalar,
+                block_info.message_modulus,
+                block_info.carry_modulus,
+            );
+        }
+        streams.synchronize();
+
+        Self::U64(scaled.d_blocks)
+    }
+}
+
+// Extensions for NoiseSimulationLweFourierBsk to support GPU operations
+impl NoiseSimulationLweFourierBsk {
+    /// Returns `true` when the simulated key and the actual GPU bootstrapping key agree on
+    /// input LWE dimension, GLWE size, polynomial size, decomposition base log and
+    /// decomposition level count.
+    ///
+    /// Classic and multi-bit keys are compared on the same five parameters.
+    pub fn matches_actual_bsk_gpu(&self, lwe_bsk: &CudaBootstrappingKey<u64>) -> bool {
+        let simulated = (
+            self.input_lwe_dimension(),
+            self.output_glwe_size(),
+            self.output_polynomial_size(),
+            self.decomp_base_log(),
+            self.decomp_level_count(),
+        );
+
+        let actual = match lwe_bsk {
+            CudaBootstrappingKey::Classic(bsk) => (
+                bsk.input_lwe_dimension(),
+                bsk.glwe_dimension().to_glwe_size(),
+                bsk.polynomial_size(),
+                bsk.decomp_base_log(),
+                bsk.decomp_level_count(),
+            ),
+            CudaBootstrappingKey::MultiBit(bsk) => (
+                bsk.input_lwe_dimension(),
+                bsk.glwe_dimension().to_glwe_size(),
+                bsk.polynomial_size(),
+                bsk.decomp_base_log(),
+                bsk.decomp_level_count(),
+            ),
+        };
+
+        simulated == actual
+    }
+}
+
+// GPU-side helpers for NoiseSimulationLweFourier128Bsk (u128 keys, used for noise squashing)
+impl NoiseSimulationLweFourier128Bsk {
+    /// Returns `true` when the parameters of this simulated bootstrapping key are all equal to
+    /// the ones of the GPU bootstrapping key `lwe_bsk`.
+    ///
+    /// The compared parameters are the input LWE dimension, the GLWE size, the polynomial size,
+    /// the decomposition base log and the decomposition level count. Classic and multi-bit keys
+    /// are checked against the same five parameters.
+    pub fn matches_actual_bsk_gpu(&self, lwe_bsk: &CudaBootstrappingKey<u128>) -> bool {
+        match lwe_bsk {
+            CudaBootstrappingKey::Classic(actual) => {
+                self.input_lwe_dimension() == actual.input_lwe_dimension()
+                    && self.output_glwe_size() == actual.glwe_dimension().to_glwe_size()
+                    && self.output_polynomial_size() == actual.polynomial_size()
+                    && self.decomp_base_log() == actual.decomp_base_log()
+                    && self.decomp_level_count() == actual.decomp_level_count()
+            }
+            CudaBootstrappingKey::MultiBit(actual) => {
+                self.input_lwe_dimension() == actual.input_lwe_dimension()
+                    && self.output_glwe_size() == actual.glwe_dimension().to_glwe_size()
+                    && self.output_polynomial_size() == actual.polynomial_size()
+                    && self.decomp_base_log() == actual.decomp_base_log()
+                    && self.decomp_level_count() == actual.decomp_level_count()
+            }
+        }
+    }
+}
+
+impl AllocateStandardModSwitchResult for CudaDynLwe {
+    type Output = Self;
+    type SideResources = CudaSideResources;
+
+    /// Allocates the destination of a standard modulus switch of `self`.
+    ///
+    /// The returned value holds the same variant as `self` and wraps a new
+    /// `CudaLweCiphertextList` created on `side_resources.streams` with the LWE dimension,
+    /// ciphertext count and ciphertext modulus of `self`.
+    fn allocate_standard_mod_switch_result(
+        &self,
+        side_resources: &mut Self::SideResources,
+    ) -> Self::Output {
+        let streams = &side_resources.streams;
+        match self {
+            Self::U32(src) => Self::U32(CudaLweCiphertextList::new(
+                src.lwe_dimension(),
+                src.lwe_ciphertext_count(),
+                src.ciphertext_modulus(),
+                streams,
+            )),
+            Self::U64(src) => Self::U64(CudaLweCiphertextList::new(
+                src.lwe_dimension(),
+                src.lwe_ciphertext_count(),
+                src.ciphertext_modulus(),
+                streams,
+            )),
+            Self::U128(src) => Self::U128(CudaLweCiphertextList::new(
+                src.lwe_dimension(),
+                src.lwe_ciphertext_count(),
+                src.ciphertext_modulus(),
+                streams,
+            )),
+        }
+    }
+}
+
+impl StandardModSwitch<Self> for CudaDynLwe {
+    type SideResources = CudaSideResources;
+
+    /// Modulus switches the `U64` ciphertext held by `self` to `output_modulus_log` bits and
+    /// stores the result in `output`.
+    ///
+    /// The switch itself runs on the GPU, on a copy of the input so that `self` is left
+    /// unmodified. The switched values are then brought back to the CPU, shifted left by
+    /// `64 - output_modulus_log` bits and uploaded again; the previous content of `output` is
+    /// replaced by this new list.
+    ///
+    /// # Panics
+    ///
+    /// Panics for the `U32` and `U128` variants (not implemented) and when `self` and `output`
+    /// hold different variants.
+    fn standard_mod_switch(
+        &self,
+        output_modulus_log: CiphertextModulusLog,
+        output: &mut Self,
+        side_resources: &mut Self::SideResources,
+    ) {
+        match (self, output) {
+            (Self::U32(_input), Self::U32(_output_cuda_lwe)) => {
+                panic!("U32 modulus switch not implemented for CudaDynLwe - only U64 is supported");
+            }
+            (Self::U64(input), Self::U64(output_cuda_lwe)) => {
+                // The GPU modulus switch works in place: run it on a copy of the input. This
+                // copy (and not the freshly allocated `output`) is the buffer that gets read
+                // back below, so it has to be the one that is switched.
+                let mut internal_output = input.duplicate(&side_resources.streams);
+                cuda_modulus_switch_ciphertext(
+                    &mut internal_output.0.d_vec,
+                    output_modulus_log.0 as u32,
+                    &side_resources.streams,
+                );
+                let mut cpu_lwe = internal_output.to_lwe_ciphertext_list(&side_resources.streams);
+                // NOTE(review): presumably the kernel leaves each value in the low
+                // `output_modulus_log` bits, and this shift moves it back to the most
+                // significant bits of the native u64 modulus - confirm against the kernel.
+                let shift_to_map_to_native = u64::BITS - output_modulus_log.0 as u32;
+                for val in cpu_lwe.as_mut_view().into_container().iter_mut() {
+                    *val <<= shift_to_map_to_native;
+                }
+                let d_after_ms = CudaLweCiphertextList::from_lwe_ciphertext_list(
+                    &cpu_lwe,
+                    &side_resources.streams,
+                );
+
+                *output_cuda_lwe = d_after_ms;
+            }
+            (Self::U128(_input), Self::U128(_output_cuda_lwe)) => {
+                panic!("U128 modulus switch not implemented for CudaDynLwe - only U64 is supported")
+            }
+            _ => panic!("Inconsistent inputs/outputs for CudaDynLwe StandardModSwitch"),
+        }
+    }
+}
+
+impl AllocateCenteredBinaryShiftedStandardModSwitchResult for CudaDynLwe {
+    type Output = Self;
+    type SideResources = CudaSideResources;
+
+    /// Allocates the destination of a centered binary shifted modulus switch of `self`.
+    ///
+    /// This is the same allocation as the one performed for a standard modulus switch, so the
+    /// call is forwarded to `AllocateStandardModSwitchResult`.
+    fn allocate_centered_binary_shifted_standard_mod_switch_result(
+        &self,
+        side_resources: &mut Self::SideResources,
+    ) -> Self::Output {
+        AllocateStandardModSwitchResult::allocate_standard_mod_switch_result(self, side_resources)
+    }
+}
+
+impl CenteredBinaryShiftedStandardModSwitch<Self> for CudaDynLwe {
+    type SideResources = CudaSideResources;
+
+    /// Applies the centered binary shifted modulus switch to the `U64` ciphertext held by
+    /// `self`, targeting `output_modulus_log` bits, and stores the result in `output`.
+    ///
+    /// The GPU kernel reads `self` and writes into a scratch copy of `output`. The scratch
+    /// values are then brought back to the CPU, shifted left by `64 - output_modulus_log` bits
+    /// and uploaded again; the previous content of `output` is replaced by this new ciphertext.
+    ///
+    /// # Panics
+    ///
+    /// Panics for the `U32` and `U128` variants (not implemented) and when `self` and `output`
+    /// hold different variants.
+    fn centered_binary_shifted_and_standard_mod_switch(
+        &self,
+        output_modulus_log: CiphertextModulusLog,
+        output: &mut Self,
+        side_resources: &mut Self::SideResources,
+    ) {
+        match (self, output) {
+            (Self::U32(_input), Self::U32(_output_cuda_lwe)) => {
+                panic!("U32 centered binary shifted modulus switch not implemented for CudaDynLwe - only U64 is supported")
+            }
+            // NOTE(review): the soundness of the raw kernel call below relies on both device
+            // buffers being large enough for an LWE ciphertext of `input.lwe_dimension()` -
+            // confirm against the contract of `cuda_centered_modulus_switch_64`.
+            (Self::U64(input), Self::U64(output_cuda_lwe)) => unsafe {
+                // Scratch buffer with the layout of the output, written by the kernel.
+                let mut internal_output = output_cuda_lwe.duplicate(&side_resources.streams);
+                cuda_centered_modulus_switch_64(
+                    side_resources.streams.ptr[0],
+                    0u32,
+                    internal_output.0.d_vec.as_mut_c_ptr(0),
+                    input.0.d_vec.as_c_ptr(0),
+                    input.lwe_dimension().0 as u32,
+                    output_modulus_log.0 as u32,
+                );
+                // Wait for the kernel before reading its result back on the CPU.
+                side_resources.streams.synchronize();
+                let cpu_lwe = internal_output.into_lwe_ciphertext(&side_resources.streams);
+                // `cpu_lwe` is owned: read its modulus first, then move its container out
+                // instead of cloning the whole ciphertext.
+                let ciphertext_modulus = cpu_lwe.ciphertext_modulus();
+                let mut cpu_ct =
+                    LweCiphertext::from_container(cpu_lwe.into_container(), ciphertext_modulus);
+                let shift_to_map_to_native = u64::BITS - output_modulus_log.0 as u32;
+                for val in cpu_ct.as_mut() {
+                    *val <<= shift_to_map_to_native;
+                }
+                let d_after_ms =
+                    CudaLweCiphertextList::from_lwe_ciphertext(&cpu_ct, &side_resources.streams);
+                *output_cuda_lwe = d_after_ms;
+            },
+            (Self::U128(_input), Self::U128(_output_cuda_lwe)) => {
+                panic!("U128 centered binary shifted modulus switch not implemented for CudaDynLwe - only U64 is supported")
+            }
+            _ => panic!(
+                "Inconsistent inputs/outputs for CudaDynLwe CenteredBinaryShiftedStandardModSwitch"
+            ),
+        }
+    }
+}
+
+impl DriftTechniqueStandardModSwitch<Self, Self, Self> for CudaDynLwe {
+    type SideResources = CudaSideResources;
+
+    /// Not supported for `CudaDynLwe`: all arguments are ignored.
+    ///
+    /// # Panics
+    ///
+    /// Always panics.
+    fn drift_technique_and_standard_mod_switch(
+        &self,
+        _output_modulus_log: CiphertextModulusLog,
+        _input: &Self,
+        _after_drift_technique: &mut Self,
+        _after_mod_switch: &mut Self,
+        _side_resources: &mut Self::SideResources,
+    ) {
+        panic!("Drift technique is being deprecated, use other flavors of mod switch instead")
+    }
+}
+
+impl AllocateLweKeyswitchResult for CudaServerKey {
+    type Output = CudaDynLwe;
+    type SideResources = CudaSideResources;
+
+    /// Allocates the destination of a keyswitch performed with this server key.
+    ///
+    /// The result is a `CudaDynLwe::U64` wrapping a single LWE ciphertext created on
+    /// `side_resources.streams`, whose dimension is derived from the output LWE size of the
+    /// key switching key and whose modulus is the ciphertext modulus of the server key.
+    fn allocate_lwe_keyswitch_result(
+        &self,
+        side_resources: &mut Self::SideResources,
+    ) -> Self::Output {
+        CudaDynLwe::U64(CudaLweCiphertextList::new(
+            self.key_switching_key
+                .output_key_lwe_size()
+                .to_lwe_dimension(),
+            LweCiphertextCount(1),
+            self.ciphertext_modulus,
+            &side_resources.streams,
+        ))
+    }
+}
+
+impl LweKeyswitch<CudaDynLwe, CudaDynLwe> for CudaServerKey {
+    type SideResources = CudaSideResources;
+
+    /// Keyswitches the `U64` ciphertext held by `input` into `output` with the key switching
+    /// key of this server key, on `side_resources.streams`.
+    ///
+    /// # Panics
+    ///
+    /// Panics for the `U32` and `U128` variants (not implemented) and when `input` and
+    /// `output` hold different variants.
+    fn lwe_keyswitch(
+        &self,
+        input: &CudaDynLwe,
+        output: &mut CudaDynLwe,
+        side_resources: &mut Self::SideResources,
+    ) {
+        // Reject every unsupported combination first, keep only the U64 -> U64 case.
+        let (src, dst) = match (input, output) {
+            (CudaDynLwe::U64(src), CudaDynLwe::U64(dst)) => (src, dst),
+            (CudaDynLwe::U32(_), CudaDynLwe::U32(_)) => {
+                panic!("U32 keyswitch not implemented for CudaServerKey - only U64 is supported")
+            }
+            (CudaDynLwe::U128(_), CudaDynLwe::U128(_)) => {
+                panic!("U128 keyswitch not implemented for CudaServerKey - only U64 is supported")
+            }
+            _ => panic!("Inconsistent input/output types for CudaDynLwe keyswitch"),
+        };
+
+        let streams = &side_resources.streams;
+        // One-element index vectors for the input and the output of the keyswitch.
+        let input_indexes = CudaVec::new(1, streams, 0);
+        let output_indexes = CudaVec::new(1, streams, 0);
+
+        cuda_keyswitch_lwe_ciphertext(
+            &self.key_switching_key,
+            src,
+            dst,
+            &input_indexes,
+            &output_indexes,
+            false,
+            streams,
+            false,
+        );
+    }
+}
+
+impl AllocateDriftTechniqueStandardModSwitchResult for CudaServerKey {
+    type AfterDriftOutput = CudaDynLwe;
+    type AfterMsOutput = CudaDynLwe;
+    type SideResources = CudaSideResources;
+
+    /// Not supported for `CudaServerKey`: nothing is allocated.
+    ///
+    /// # Panics
+    ///
+    /// Always panics.
+    fn allocate_drift_technique_standard_mod_switch_result(
+        &self,
+        _side_resources: &mut Self::SideResources,
+    ) -> (Self::AfterDriftOutput, Self::AfterMsOutput) {
+        panic!("Drift technique is being deprecated, use other flavors of mod switch instead")
+    }
+}
+
+impl DriftTechniqueStandardModSwitch<CudaDynLwe, CudaDynLwe, CudaDynLwe> for CudaServerKey {
+    type SideResources = CudaSideResources;
+
+    /// Not supported for `CudaServerKey`: all arguments are ignored.
+    ///
+    /// # Panics
+    ///
+    /// Always panics.
+    fn drift_technique_and_standard_mod_switch(
+        &self,
+        _output_modulus_log: CiphertextModulusLog,
+        _input: &CudaDynLwe,
+        _after_drift_technique: &mut CudaDynLwe,
+        _after_mod_switch: &mut CudaDynLwe,
+        _side_resources: &mut Self::SideResources,
+    ) {
+        panic!("Drift technique is being deprecated, use other flavors of mod switch instead")
+    }
+}
+
+impl CudaServerKey {
+    /// Returns the blind rotation input modulus log derived from the polynomial size of the
+    /// bootstrapping key, for both classic and multi-bit keys.
+    pub fn br_input_modulus_log(&self) -> CiphertextModulusLog {
+        let polynomial_size = match &self.bootstrapping_key {
+            CudaBootstrappingKey::Classic(classic_bsk) => classic_bsk.polynomial_size(),
+            CudaBootstrappingKey::MultiBit(multi_bit_bsk) => multi_bit_bsk.polynomial_size(),
+        };
+        polynomial_size.to_blind_rotation_input_modulus_log()
+    }
+
+    /// Returns the noise simulation modulus switch configuration matching the bootstrapping
+    /// key of this server key.
+    ///
+    /// For a classic key, no noise reduction configuration maps to `Standard` and the
+    /// `Centered` configuration maps to `CenteredMeanNoiseReduction`.
+    ///
+    /// # Panics
+    ///
+    /// Panics for multi-bit bootstrapping keys, which are not handled yet.
+    pub fn noise_simulation_modulus_switch_config(
+        &self,
+    ) -> NoiseSimulationModulusSwitchConfig<&Self> {
+        match &self.bootstrapping_key {
+            CudaBootstrappingKey::MultiBit(_) => {
+                todo!()
+            }
+            CudaBootstrappingKey::Classic(classic_bsk) => {
+                match &classic_bsk.ms_noise_reduction_configuration {
+                    Some(CudaModulusSwitchNoiseReductionConfiguration::Centered) => {
+                        NoiseSimulationModulusSwitchConfig::CenteredMeanNoiseReduction
+                    }
+                    None => NoiseSimulationModulusSwitchConfig::Standard,
+                }
+            }
+        }
+    }
+}
+
+impl CudaNoiseSquashingKey {
+    /// Returns the noise simulation modulus switch configuration matching the bootstrapping
+    /// key of this noise squashing key.
+    ///
+    /// For a classic key, no noise reduction configuration maps to `Standard` and the
+    /// `Centered` configuration maps to `CenteredMeanNoiseReduction`.
+    ///
+    /// # Panics
+    ///
+    /// Panics for multi-bit bootstrapping keys, which are not handled yet.
+    pub fn noise_simulation_modulus_switch_config(
+        &self,
+    ) -> NoiseSimulationModulusSwitchConfig<&Self> {
+        match &self.bootstrapping_key {
+            CudaBootstrappingKey::MultiBit(_) => {
+                todo!()
+            }
+            CudaBootstrappingKey::Classic(classic_bsk) => {
+                match &classic_bsk.ms_noise_reduction_configuration {
+                    Some(CudaModulusSwitchNoiseReductionConfiguration::Centered) => {
+                        NoiseSimulationModulusSwitchConfig::CenteredMeanNoiseReduction
+                    }
+                    None => NoiseSimulationModulusSwitchConfig::Standard,
+                }
+            }
+        }
+    }
+}
+
+impl AllocateDriftTechniqueStandardModSwitchResult for CudaNoiseSquashingKey {
+    type AfterDriftOutput = CudaDynLwe;
+    type AfterMsOutput = CudaDynLwe;
+    type SideResources = CudaSideResources;
+
+    /// Not supported for `CudaNoiseSquashingKey`: nothing is allocated.
+    ///
+    /// # Panics
+    ///
+    /// Always panics.
+    fn allocate_drift_technique_standard_mod_switch_result(
+        &self,
+        _side_resources: &mut Self::SideResources,
+    ) -> (Self::AfterDriftOutput, Self::AfterMsOutput) {
+        panic!("Drift technique is being deprecated, use other flavors of mod switch instead")
+    }
+}
+
+impl DriftTechniqueStandardModSwitch<CudaDynLwe, CudaDynLwe, CudaDynLwe> for CudaNoiseSquashingKey {
+    type SideResources = CudaSideResources;
+
+    /// Not supported for `CudaNoiseSquashingKey`: all arguments are ignored.
+    ///
+    /// # Panics
+    ///
+    /// Always panics.
+    fn drift_technique_and_standard_mod_switch(
+        &self,
+        _output_modulus_log: CiphertextModulusLog,
+        _input: &CudaDynLwe,
+        _after_drift_technique: &mut CudaDynLwe,
+        _after_mod_switch: &mut CudaDynLwe,
+        _side_resources: &mut Self::SideResources,
+    ) {
+        panic!("Drift technique is being deprecated, use other flavors of mod switch instead")
+    }
+}
+
+/// PBS output allocation for a `u64` accumulator; the result is wrapped in `CudaDynLwe` (for
+/// test compatibility).
+impl AllocateLweBootstrapResult for CudaGlweCiphertextList<u64> {
+    type Output = CudaDynLwe;
+    type SideResources = CudaSideResources;
+
+    /// Allocates a single LWE ciphertext on `side_resources.streams`, wrapped in
+    /// `CudaDynLwe::U64`.
+    ///
+    /// Its dimension is the LWE dimension equivalent to the GLWE dimension and polynomial size
+    /// of `self`, and its modulus is the ciphertext modulus of `self`.
+    fn allocate_lwe_bootstrap_result(
+        &self,
+        side_resources: &mut Self::SideResources,
+    ) -> Self::Output {
+        let equivalent_lwe_dimension = self
+            .glwe_dimension()
+            .to_equivalent_lwe_dimension(self.polynomial_size());
+
+        CudaDynLwe::U64(CudaLweCiphertextList::new(
+            equivalent_lwe_dimension,
+            LweCiphertextCount(1),
+            self.ciphertext_modulus(),
+            &side_resources.streams,
+        ))
+    }
+}
+
+// Classic FFT PBS on GPU through the bootstrapping key of CudaServerKey
+impl LweClassicFftBootstrap<CudaDynLwe, CudaDynLwe, CudaGlweCiphertextList<u64>> for CudaServerKey {
+    type SideResources = CudaSideResources;
+
+    /// Bootstraps the `U64` ciphertext held by `input` into `output`, using `accumulator` and
+    /// the classic bootstrapping key of this server key, on `side_resources.streams`.
+    ///
+    /// # Panics
+    ///
+    /// Panics when `input` or `output` is not the `U64` variant, and when the server key holds
+    /// a multi-bit bootstrapping key.
+    fn lwe_classic_fft_pbs(
+        &self,
+        input: &CudaDynLwe,
+        output: &mut CudaDynLwe,
+        accumulator: &CudaGlweCiphertextList<u64>,
+        side_resources: &mut Self::SideResources,
+    ) {
+        use crate::core_crypto::gpu::algorithms::lwe_programmable_bootstrapping::cuda_programmable_bootstrap_lwe_ciphertext;
+        use crate::core_crypto::gpu::vec::CudaVec;
+        use crate::integer::gpu::server_key::CudaBootstrappingKey;
+        use crate::integer::gpu::CastInto;
+
+        let (src, dst) = match (input, output) {
+            (CudaDynLwe::U64(src), CudaDynLwe::U64(dst)) => (src, dst),
+            _ => panic!("Only U64 PBS is supported for CudaServerKey"),
+        };
+        let streams = &side_resources.streams;
+
+        // Host-side indexes 0..num_ct_blocks, shared by the LUT, input and output index
+        // vectors.
+        let num_ct_blocks = 1;
+        let host_indexes: Vec<u64> = (0..num_ct_blocks)
+            .map(<usize as CastInto<u64>>::cast_into)
+            .collect();
+
+        // Device allocations first, then the three uploads, all queued with index 0.
+        let (d_lut_vector_indexes, d_input_indexes, d_output_indexes) = unsafe {
+            let mut lut_indexes = CudaVec::<u64>::new_async(num_ct_blocks, streams, 0);
+            let mut in_indexes = CudaVec::<u64>::new_async(num_ct_blocks, streams, 0);
+            let mut out_indexes = CudaVec::<u64>::new_async(num_ct_blocks, streams, 0);
+
+            lut_indexes.copy_from_cpu_async(&host_indexes, streams, 0);
+            in_indexes.copy_from_cpu_async(&host_indexes, streams, 0);
+            out_indexes.copy_from_cpu_async(&host_indexes, streams, 0);
+
+            (lut_indexes, in_indexes, out_indexes)
+        };
+
+        match &self.bootstrapping_key {
+            CudaBootstrappingKey::Classic(d_bsk) => {
+                cuda_programmable_bootstrap_lwe_ciphertext(
+                    src,
+                    dst,
+                    accumulator,
+                    &d_lut_vector_indexes,
+                    &d_output_indexes,
+                    &d_input_indexes,
+                    d_bsk,
+                    streams,
+                );
+            }
+            CudaBootstrappingKey::MultiBit(_d_multibit_bsk) => {
+                panic!("Can not execute MultiBit PBS from classic FFT PBS implementation");
+            }
+        }
+    }
+}
+
+impl AllocateLweBootstrapResult for CudaGlweCiphertextList<u128> {
+    type Output = CudaDynLwe;
+    type SideResources = CudaSideResources;
+
+    /// Allocates a single `u128` LWE ciphertext on `side_resources.streams`, wrapped in
+    /// `CudaDynLwe::U128`.
+    ///
+    /// Its dimension is the LWE dimension equivalent to the GLWE dimension and polynomial size
+    /// of `self`, and its modulus is the ciphertext modulus of `self`.
+    fn allocate_lwe_bootstrap_result(
+        &self,
+        side_resources: &mut Self::SideResources,
+    ) -> Self::Output {
+        let equivalent_lwe_dimension = self
+            .glwe_dimension()
+            .to_equivalent_lwe_dimension(self.polynomial_size());
+
+        CudaDynLwe::U128(CudaLweCiphertextList::<u128>::new(
+            equivalent_lwe_dimension,
+            LweCiphertextCount(1),
+            self.ciphertext_modulus(),
+            &side_resources.streams,
+        ))
+    }
+}