From 1eb1ff9d89fac49d94191ef27c53fac08b9acdef Mon Sep 17 00:00:00 2001 From: Guillermo Oyarzun Date: Tue, 2 Dec 2025 18:40:06 +0100 Subject: [PATCH] feat(gpu): create noise and pfail tests for cpk --- .../tfhe-cuda-backend/cuda/include/zk/zk.h | 3 +- .../cuda/include/zk/zk_enums.h | 7 + .../cuda/include/zk/zk_utilities.h | 34 +- backends/tfhe-cuda-backend/cuda/src/zk/zk.cu | 5 +- backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh | 27 +- backends/tfhe-cuda-backend/src/bindings.rs | 5 + tfhe/src/high_level_api/compact_list.rs | 6 +- .../integer/gpu/ciphertext/compact_list.rs | 4 +- tfhe/src/integer/gpu/mod.rs | 11 +- .../tests_noise_distribution/cpk_ks_ms.rs | 818 ++++++++++++++++++ .../radix/tests_noise_distribution/mod.rs | 1 + .../utils/key_switching_test_utils.rs | 28 + .../tests_noise_distribution/utils/mod.rs | 1 + tfhe/src/integer/gpu/zk/mod.rs | 3 +- 14 files changed, 937 insertions(+), 16 deletions(-) create mode 100644 backends/tfhe-cuda-backend/cuda/include/zk/zk_enums.h create mode 100644 tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/cpk_ks_ms.rs create mode 100644 tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/key_switching_test_utils.rs diff --git a/backends/tfhe-cuda-backend/cuda/include/zk/zk.h b/backends/tfhe-cuda-backend/cuda/include/zk/zk.h index 713938f87..066f91d9d 100644 --- a/backends/tfhe-cuda-backend/cuda/include/zk/zk.h +++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk.h @@ -3,6 +3,7 @@ #include "../keyswitch/ks_enums.h" #include "../pbs/pbs_enums.h" +#include "zk_enums.h" #include extern "C" { @@ -16,7 +17,7 @@ uint64_t scratch_cuda_expand_without_verification_64( uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list, const bool *is_boolean_array, uint32_t num_compact_lists, uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, - KS_TYPE casting_key_type, bool allocate_gpu_memory, + KS_TYPE casting_key_type, bool allocate_gpu_memory, EXPAND_KIND expand_kind, PBS_MS_REDUCTION_T noise_reduction_type); void cuda_expand_without_verification_64( diff --git a/backends/tfhe-cuda-backend/cuda/include/zk/zk_enums.h b/backends/tfhe-cuda-backend/cuda/include/zk/zk_enums.h new file mode 100644 index 000000000..7690307e1 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_enums.h @@ -0,0 +1,7 @@ +#ifndef CUDA_ZK_ENUMS_H +#define CUDA_ZK_ENUMS_H +#include +// In addition to the two kinds of expand (no_casting and casting), there is a +// third kind that is used only in the noise tests +enum EXPAND_KIND { NO_CASTING = 0, CASTING = 1, SANITY_CHECK = 2 }; +#endif // CUDA_ZK_ENUMS_H diff --git a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h index fa9bbee23..262866dde 100644 --- a/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h @@ -1,6 +1,5 @@ #ifndef ZK_UTILITIES_H #define ZK_UTILITIES_H - #include "../integer/integer_utilities.h" #include "integer/integer.cuh" #include @@ -103,6 +102,7 @@ template <typename Torus> struct zk_expand_mem { uint32_t num_compact_lists; int_radix_lut<Torus> *message_and_carry_extract_luts; + int_radix_lut<Torus> *identity_lut; Torus *tmp_expanded_lwes; Torus *tmp_ksed_small_to_big_expanded_lwes; @@ -113,15 +113,17 @@ template <typename Torus> struct zk_expand_mem { expand_job<Torus> *d_expand_jobs; expand_job<Torus> *h_expand_jobs; + EXPAND_KIND expand_kind; + zk_expand_mem(CudaStreams streams, int_radix_params computing_params, int_radix_params casting_params, KS_TYPE
casting_key_type, const uint32_t *num_lwes_per_compact_list, const bool *is_boolean_array, uint32_t num_compact_lists, - bool allocate_gpu_memory, uint64_t &size_tracker) + bool allocate_gpu_memory, uint64_t &size_tracker, + EXPAND_KIND expand_kind_in) : computing_params(computing_params), casting_params(casting_params), num_compact_lists(num_compact_lists), - casting_key_type(casting_key_type) { - + casting_key_type(casting_key_type), expand_kind(expand_kind_in) { gpu_memory_allocated = allocate_gpu_memory; // We copy num_lwes_per_compact_list so we get protection against @@ -136,10 +138,27 @@ template <typename Torus> struct zk_expand_mem { num_lwes += this->num_lwes_per_compact_list[i]; } - if (computing_params.carry_modulus != computing_params.message_modulus) { + if (computing_params.carry_modulus != computing_params.message_modulus && + expand_kind == EXPAND_KIND::CASTING) { PANIC("GPU backend requires carry_modulus equal to message_modulus") } + // We create the identity LUT only if we are doing a SANITY_CHECK + if (expand_kind == EXPAND_KIND::SANITY_CHECK) { + identity_lut = + new int_radix_lut<Torus>(streams, computing_params, 1, 2 * num_lwes, + allocate_gpu_memory, size_tracker); + + auto identity_lut_f = [](Torus x) -> Torus { return x; }; + + generate_device_accumulator<Torus>( + streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0), + identity_lut->get_degree(0), identity_lut->get_max_degree(0), + casting_params.glwe_dimension, casting_params.polynomial_size, + casting_params.message_modulus, casting_params.carry_modulus, + identity_lut_f, gpu_memory_allocated); + } + auto message_extract_lut_f = [casting_params](Torus x) -> Torus { return x % casting_params.message_modulus; }; @@ -317,6 +336,11 @@ template <typename Torus> struct zk_expand_mem { message_and_carry_extract_luts->release(streams); delete message_and_carry_extract_luts; + if (expand_kind == EXPAND_KIND::SANITY_CHECK) { + identity_lut->release(streams); + delete identity_lut; + } + cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams.stream(0), streams.gpu_index(0), gpu_memory_allocated); diff --git a/backends/tfhe-cuda-backend/cuda/src/zk/zk.cu b/backends/tfhe-cuda-backend/cuda/src/zk/zk.cu index 6846cba28..9bb20fbcb 100644 --- a/backends/tfhe-cuda-backend/cuda/src/zk/zk.cu +++ b/backends/tfhe-cuda-backend/cuda/src/zk/zk.cu @@ -10,7 +10,7 @@ uint64_t scratch_cuda_expand_without_verification_64( uint32_t grouping_factor, const uint32_t *num_lwes_per_compact_list, const bool *is_boolean_array, uint32_t num_compact_lists, uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type, - KS_TYPE casting_key_type, bool allocate_gpu_memory, + KS_TYPE casting_key_type, bool allocate_gpu_memory, EXPAND_KIND expand_kind, PBS_MS_REDUCTION_T noise_reduction_type) { // Since CUDA backend works with the concept of "big" and "small" key, instead @@ -37,7 +37,8 @@ uint64_t scratch_cuda_expand_without_verification_64( CudaStreams(streams), reinterpret_cast<zk_expand_mem<uint64_t> **>(mem_ptr), num_lwes_per_compact_list, is_boolean_array, num_compact_lists, - computing_params, casting_params, casting_key_type, allocate_gpu_memory); + computing_params, casting_params, casting_key_type, allocate_gpu_memory, + expand_kind); } void cuda_expand_without_verification_64( diff --git a/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh b/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh index 912387fd2..7697b9576 100644 --- a/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/zk/zk.cuh @@ -12,6 +12,7 @@ #include "utils/helper.cuh" #include "utils/helper_multi_gpu.cuh" #include "utils/kernel_dimensions.cuh" +#include "zk/zk_enums.h" #include "zk/zk_utilities.h" #include @@ -54,15 +55,24 @@ __host__ void host_expand_without_verification( compact_lwe_lists.total_num_lwes * sizeof(expand_job<Torus>), streams.stream(0), streams.gpu_index(0), true); + if (mem_ptr->expand_kind == EXPAND_KIND::NO_CASTING) { + host_lwe_expand(streams.stream(0), streams.gpu_index(0), + lwe_array_out, d_expand_jobs, num_lwes); + return; + } + host_lwe_expand(streams.stream(0), streams.gpu_index(0), expanded_lwes, d_expand_jobs, num_lwes); - auto ksks = casting_keys; auto lwe_array_input = expanded_lwes; + auto ksks = casting_keys; auto message_and_carry_extract_luts = mem_ptr->message_and_carry_extract_luts; auto lut = mem_ptr->message_and_carry_extract_luts; if (casting_key_type == SMALL_TO_BIG) { + if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) { + PANIC("SANITY_CHECK not supported for SMALL_TO_BIG casting"); + } // Keyswitch from small to big key if needed auto ksed_small_to_big_expanded_lwes = mem_ptr->tmp_ksed_small_to_big_expanded_lwes; @@ -95,6 +105,17 @@ __host__ void host_expand_without_verification( into_radix_ciphertext(output, lwe_array_out, 2 * num_lwes, lwe_dimension); auto input = new CudaRadixCiphertextFFI; into_radix_ciphertext(input, lwe_array_input, 2 * num_lwes, lwe_dimension); + + // This is a special case only for our noise sanity checks + // If we are doing a SANITY_CHECK expand, we just apply the identity LUT + // This replicates the CPU fallback behaviour of the casting expand + if (mem_ptr->expand_kind == EXPAND_KIND::SANITY_CHECK) { + integer_radix_apply_univariate_lookup_table<Torus>( + streams, output, input, bsks, ksks, mem_ptr->identity_lut, + 2 * num_lwes); + return; + } + integer_radix_apply_univariate_lookup_table<Torus>( streams, output, input, bsks, ksks, message_and_carry_extract_luts, 2 * num_lwes); @@ -106,13 +127,13 @@ __host__ uint64_t scratch_cuda_expand_without_verification( const uint32_t *num_lwes_per_compact_list, const bool *is_boolean_array, uint32_t num_compact_lists, int_radix_params computing_params, int_radix_params casting_params, KS_TYPE casting_key_type, - bool allocate_gpu_memory) { + bool allocate_gpu_memory, EXPAND_KIND expand_kind) { uint64_t size_tracker = 0; *mem_ptr = new zk_expand_mem<Torus>( streams, computing_params, casting_params, casting_key_type, num_lwes_per_compact_list, is_boolean_array, num_compact_lists, - allocate_gpu_memory, size_tracker); + allocate_gpu_memory, size_tracker, expand_kind); return size_tracker; } diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs index 86efa2367..fabde9851 100644 --- a/backends/tfhe-cuda-backend/src/bindings.rs +++ b/backends/tfhe-cuda-backend/src/bindings.rs @@ -2471,6 +2471,10 @@ unsafe extern "C" { pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0; pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1; pub type KS_TYPE = ffi::c_uint; +pub const EXPAND_KIND_NO_CASTING: EXPAND_KIND = 0; +pub const EXPAND_KIND_CASTING: EXPAND_KIND = 1; +pub const EXPAND_KIND_SANITY_CHECK: EXPAND_KIND = 2; +pub type EXPAND_KIND = ffi::c_uint; unsafe extern "C" { pub fn scratch_cuda_expand_without_verification_64( streams: CudaStreamsFFI, @@ -2496,6 +2500,7 @@ unsafe extern "C" { pbs_type: PBS_TYPE, casting_key_type: KS_TYPE, allocate_gpu_memory: bool, + expand_kind: EXPAND_KIND, noise_reduction_type: PBS_MS_REDUCTION_T, ) -> u64; } diff --git a/tfhe/src/high_level_api/compact_list.rs b/tfhe/src/high_level_api/compact_list.rs index
f1180ea47..8968d4288 100644 --- a/tfhe/src/high_level_api/compact_list.rs +++ b/tfhe/src/high_level_api/compact_list.rs @@ -350,7 +350,8 @@ impl CompactCiphertextList { .unwrap(), dest_server_key: &cuda_key.key.key, }; - let expander = gpu_inner.expand(&ksk, streams)?; + let expander = + gpu_inner.expand(&ksk, crate::integer::gpu::ZKType::Casting, streams)?; Ok(CompactCiphertextListExpander { inner: InnerCompactCiphertextListExpander::Cuda(expander), @@ -390,7 +391,8 @@ impl CompactCiphertextList { dest_server_key: &cuda_key.key.key, }; let streams = &cuda_key.streams; - let expander = gpu_inner.expand(&ksk, streams)?; + let expander = + gpu_inner.expand(&ksk, crate::integer::gpu::ZKType::Casting, streams)?; Ok(CompactCiphertextListExpander { inner: InnerCompactCiphertextListExpander::Cuda(expander), diff --git a/tfhe/src/integer/gpu/ciphertext/compact_list.rs b/tfhe/src/integer/gpu/ciphertext/compact_list.rs index b5637a644..4fe9e5d82 100644 --- a/tfhe/src/integer/gpu/ciphertext/compact_list.rs +++ b/tfhe/src/integer/gpu/ciphertext/compact_list.rs @@ -21,7 +21,6 @@ use crate::GpuIndex; use itertools::Itertools; use serde::Deserializer; use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_to_gpu; - #[derive(Clone)] pub struct CudaCompactCiphertextListInfo { pub info: CudaBlockInfo, @@ -377,6 +376,7 @@ impl CudaFlattenedVecCompactCiphertextList { pub fn expand( &self, key: &CudaKeySwitchingKey, + zk_type: crate::integer::gpu::ZKType, streams: &CudaStreams, ) -> crate::Result<CudaCompactCiphertextListExpander> { assert!( @@ -441,6 +441,7 @@ impl CudaFlattenedVecCompactCiphertextList { LweBskGroupingFactor(0), self.num_lwe_per_compact_list.as_slice(), self.is_boolean.as_slice(), + zk_type, d_bsk.ms_noise_reduction_configuration.as_ref(), ); } @@ -476,6 +477,7 @@ impl CudaFlattenedVecCompactCiphertextList { d_multibit_bsk.grouping_factor, self.num_lwe_per_compact_list.as_slice(), self.is_boolean.as_slice(), + zk_type, None, ); } diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index 81a9eee08..32444a38e 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -80,6 +80,12 @@ pub enum ComparisonType { MAX = 6, MIN = 7, } +#[repr(u32)] +pub enum ZKType { + NoCasting = 0, + Casting = 1, + SanityCheck = 2, +} fn resolve_noise_reduction_type( ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, @@ -7597,6 +7603,7 @@ pub(crate) unsafe fn cuda_backend_expand( grouping_factor: LweBskGroupingFactor, num_lwes_per_compact_list: &[u32], is_boolean: &[bool], + zk_type: ZKType, ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>, ) { assert_eq!( @@ -7665,6 +7672,7 @@ pub(crate) unsafe fn cuda_backend_expand( pbs_type as u32, casting_key_type as u32, true, + zk_type as u32, noise_reduction_type as u32, ); cuda_expand_without_verification_64( @@ -10218,12 +10226,13 @@ pub unsafe fn unchecked_small_scalar_mul_integer_async( cuda_small_scalar_multiplication_integer_64_inplace( streams.ffi(), - &raw mut cuda_ffi_lwe_array, + &mut cuda_ffi_lwe_array, small_scalar, message_modulus.0 as u32, carry_modulus.0 as u32, ); } + #[allow(clippy::too_many_arguments)] /// # Safety /// diff --git a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/cpk_ks_ms.rs b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/cpk_ks_ms.rs new file mode 100644 index 000000000..65137b358 --- /dev/null +++ b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/cpk_ks_ms.rs @@ -0,0 +1,818 @@ +use
crate::integer::gpu::ciphertext::compact_list::CudaFlattenedVecCompactCiphertextList; + +use crate::core_crypto::commons::parameters::CiphertextModulusLog; +use crate::shortint::client_key::atomic_pattern::AtomicPatternClientKey; +use crate::shortint::encoding::ShortintEncoding; +use crate::shortint::engine::ShortintEngine; +use crate::shortint::parameters::test_params::TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128; +use crate::shortint::parameters::{ + AtomicPatternParameters, CarryModulus, CompactCiphertextListExpansionKind, + CompactPublicKeyEncryptionParameters, MetaParameters, ShortintCompactCiphertextListCastingMode, + ShortintKeySwitchingParameters, +}; +use crate::shortint::server_key::tests::noise_distribution::utils::noise_simulation::{ + DynLwe, NoiseSimulationLwe, NoiseSimulationLweKeyswitchKey, NoiseSimulationModulusSwitchConfig, +}; +use crate::shortint::server_key::tests::noise_distribution::utils::{ + mean_and_variance_check, normality_check, pfail_check, update_ap_params_for_pfail, + DecryptionAndNoiseResult, NoiseSample, PfailTestMeta, PfailTestResult, +}; +use crate::shortint::server_key::tests::noise_distribution::{ + should_run_short_pfail_tests_debug, should_use_single_key_debug, +}; + +use crate::shortint::server_key::tests::parameterized_test::create_parameterized_test; +use crate::shortint::PaddingBit; +use rayon::prelude::*; +use crate::integer::gpu::server_key::radix::CudaUnsignedRadixCiphertext; +use crate::integer::gpu::CudaServerKey; +use crate::integer::{ClientKey, CompressedServerKey}; +use crate::GpuIndex; +use crate::core_crypto::gpu::{CudaSideResources, CudaStreams}; +use crate::shortint::ShortintParameterSet; +use crate::integer::gpu::key_switching_key::CudaKeySwitchingKey; +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::shortint::server_key::tests::noise_distribution::cpk_ks_ms::cpk_ks_any_ms; +use crate::integer::gpu::server_key::radix::tests_noise_distribution::utils::noise_simulation::CudaDynLwe; +use crate::core_crypto::commons::noise_formulas::noise_simulation::traits::lwe_programmable_bootstrap::LweClassicFftBootstrap; +use crate::core_crypto::commons::noise_formulas::noise_simulation::traits::lwe_programmable_bootstrap::AllocateLweBootstrapResult; +use crate::core_crypto::commons::noise_formulas::noise_simulation::traits::{ + AllocateLweKeyswitchResult, LweKeyswitch, +}; +use crate::core_crypto::gpu::algorithms::lwe_keyswitch::cuda_keyswitch_lwe_ciphertext; +use crate::core_crypto::gpu::vec::CudaVec; +use crate::core_crypto::prelude::LweCiphertextCount; +use crate::shortint::key_switching_key::CudaKeySwitchingKeyMaterial; +use crate::integer::key_switching_key::KeySwitchingKey; +use crate::integer::{CompactPublicKey, CompactPrivateKey}; +use crate::core_crypto::prelude::LweCiphertext; +use crate::integer::ciphertext::DataKind; +use std::num::NonZeroUsize; +use crate::integer::gpu::server_key::radix::tests_noise_distribution::utils::key_switching_test_utils::new_key_switching_key_for_pfail_test; + +#[allow(clippy::too_many_arguments)] +#[allow(clippy::type_complexity)] +fn cpk_ks_any_ms_inner_helper_gpu( + params: AtomicPatternParameters, + cpk_params: CompactPublicKeyEncryptionParameters, + ksk_ds_params: ShortintKeySwitchingParameters, + single_cpk_private_key: &CompactPrivateKey<Vec<u64>>, + single_cpk: &CompactPublicKey, + single_cuda_ksk: &CudaKeySwitchingKey<'_>, + single_cks: &ClientKey, + single_cuda_sks:
&CudaServerKey, + msg: u64, + br_input_modulus_log: CiphertextModulusLog, + streams: &CudaStreams, +) -> ( + DecryptionAndNoiseResult, + DecryptionAndNoiseResult, + DecryptionAndNoiseResult, + DecryptionAndNoiseResult, +) { + let mut engine = ShortintEngine::new(); + let thread_cpk_private_key; + let thread_cpk; + let thread_cuda_ksk; + let thread_cks; + let thread_sks; + let thread_cuda_sks; + let thread_cuda_ksk_material; + let (cpk_private_key, cpk, cuda_ksk, cks, cuda_sks) = if should_use_single_key_debug() { + ( + single_cpk_private_key, + single_cpk, + single_cuda_ksk, + single_cks, + single_cuda_sks, + ) + } else { + thread_cpk_private_key = CompactPrivateKey::new(cpk_params); + thread_cpk = CompactPublicKey::new(&thread_cpk_private_key); + + let block_params: ShortintParameterSet = params.into(); + thread_cks = crate::integer::ClientKey::new(block_params); + let compressed_server_key = + CompressedServerKey::new_radix_compressed_server_key(&thread_cks); + thread_sks = compressed_server_key.decompress(); + thread_cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams); + let ksk = new_key_switching_key_for_pfail_test( + (&thread_cpk_private_key, None), + (&thread_cks, &thread_sks), + ksk_ds_params, + ); + thread_cuda_ksk_material = + CudaKeySwitchingKeyMaterial::from_key_switching_key(&ksk, &streams); + thread_cuda_ksk = CudaKeySwitchingKey::from_cuda_key_switching_key_material( + &thread_cuda_ksk_material, + &thread_cuda_sks, + ); + + ( + &thread_cpk_private_key, + &thread_cpk, + &thread_cuda_ksk, + &thread_cks, + &thread_cuda_sks, + ) + }; + + //let br_input_modulus_log = sks.br_input_modulus_log(); + let modulus_switch_config = cuda_sks.noise_simulation_modulus_switch_config(); + let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo { + degree: crate::shortint::ciphertext::Degree::new(1), + message_modulus: params.message_modulus(), + carry_modulus: params.carry_modulus(), + atomic_pattern: crate::shortint::AtomicPatternKind::Standard( + crate::shortint::PBSOrder::KeyswitchBootstrap, + ), + noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL, + }; + let mut cuda_side_resources = CudaSideResources::new(streams, cuda_block_info); + let ct = { + let compact_list = cpk.key.encrypt_iter_with_modulus_with_engine( + core::iter::once(msg), + cpk.key.parameters.message_modulus.0, + &mut engine, + ); + + let num_blocks = 1usize; + + let data_info = vec![DataKind::Unsigned(NonZeroUsize::new(num_blocks).unwrap())]; + let cuda_casting_compact_list = + CudaFlattenedVecCompactCiphertextList::from_vec_shortint_compact_ciphertext_list( + vec![compact_list.clone()], + data_info, + &cuda_side_resources.streams, + ); + let cuda_compact_list_expander = cuda_casting_compact_list + .expand( + &cuda_ksk, + crate::integer::gpu::ZKType::NoCasting, + &cuda_side_resources.streams, + ) + .unwrap(); + + let cuda_expanded_ct: CudaUnsignedRadixCiphertext = cuda_compact_list_expander + .get(0usize, &cuda_side_resources.streams) + .unwrap() + .unwrap(); + + CudaDynLwe::U64(cuda_expanded_ct.ciphertext.d_blocks) + }; + + let (input_gpu, after_ks_ds_gpu, after_drift_gpu, after_ms_gpu) = cpk_ks_any_ms( + ct, + cuda_ksk, + modulus_switch_config, + br_input_modulus_log, + &mut cuda_side_resources, + ); + let input_list = input_gpu + .as_lwe_64() + .to_lwe_ciphertext_list(&cuda_side_resources.streams); + let input_ct = LweCiphertext::from_container( + input_list.clone().into_container(), + input_list.ciphertext_modulus(), + ); + let input = DynLwe::U64(input_ct); + 
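+ // The after-KS, before-MS and after-MS GPU outputs below are brought back to the CPU in the + // same way, so the shared shortint decryption and noise-measurement helpers can inspect them.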
+ + let after_ks_ds_list = after_ks_ds_gpu + .as_lwe_64() + .to_lwe_ciphertext_list(&cuda_side_resources.streams); + let after_ks_ds_ct = LweCiphertext::from_container( + after_ks_ds_list.clone().into_container(), + after_ks_ds_list.ciphertext_modulus(), + ); + let after_ks_ds = DynLwe::U64(after_ks_ds_ct); + + let before_ms_gpu: &CudaDynLwe = after_drift_gpu.as_ref().unwrap_or(&after_ks_ds_gpu); + let before_ms_list = before_ms_gpu + .as_lwe_64() + .to_lwe_ciphertext_list(&cuda_side_resources.streams); + let before_ms_ct = LweCiphertext::from_container( + before_ms_list.clone().into_container(), + before_ms_list.ciphertext_modulus(), + ); + let before_ms = DynLwe::U64(before_ms_ct); + let after_ms_list = after_ms_gpu + .as_lwe_64() + .to_lwe_ciphertext_list(&cuda_side_resources.streams); + let after_ms_ct = LweCiphertext::from_container( + after_ms_list.clone().into_container(), + after_ms_list.ciphertext_modulus(), + ); + let after_ms = DynLwe::U64(after_ms_ct); + match &cks.key.atomic_pattern { + AtomicPatternClientKey::Standard(standard_atomic_pattern_client_key) => { + let params = standard_atomic_pattern_client_key.parameters; + let encoding = ShortintEncoding { + ciphertext_modulus: params.ciphertext_modulus(), + message_modulus: params.message_modulus(), + carry_modulus: params.carry_modulus(), + padding_bit: PaddingBit::Yes, + }; + + let cpk_lwe_secret_key = cpk_private_key.key.key(); + + let small_compute_lwe_secret_key = + standard_atomic_pattern_client_key.small_lwe_secret_key(); + ( + DecryptionAndNoiseResult::new_from_lwe( + &input.as_lwe_64(), + &cpk_lwe_secret_key, + msg, + &encoding, + ), + DecryptionAndNoiseResult::new_from_lwe( + &after_ks_ds.as_lwe_64(), + &small_compute_lwe_secret_key, + msg, + &encoding, + ), + DecryptionAndNoiseResult::new_from_lwe( + &before_ms.as_lwe_64(), + &small_compute_lwe_secret_key, + msg, + &encoding, + ), + DecryptionAndNoiseResult::new_from_lwe( + &after_ms.as_lwe_64(), + &small_compute_lwe_secret_key, + msg, + &encoding, + ), + ) + } + AtomicPatternClientKey::KeySwitch32(_ks32_atomic_pattern_client_key) => { + panic!("KeySwitch32 atomic pattern is not supported on GPU yet"); + } + } +} + +#[allow(clippy::too_many_arguments)] +#[allow(clippy::type_complexity)] +fn cpk_ks_any_ms_noise_helper_gpu( + params: AtomicPatternParameters, + cpk_params: CompactPublicKeyEncryptionParameters, + ksk_ds_params: ShortintKeySwitchingParameters, + single_cpk_private_key: &CompactPrivateKey<Vec<u64>>, + single_cpk: &CompactPublicKey, + single_cuda_ksk_ds: &CudaKeySwitchingKey<'_>, + single_cks: &ClientKey, + single_cuda_sks: &CudaServerKey, + msg: u64, + br_input_modulus_log: CiphertextModulusLog, + streams: &CudaStreams, +) -> (NoiseSample, NoiseSample, NoiseSample, NoiseSample) { + let (input, after_ks_ds, before_ms, after_ms) = cpk_ks_any_ms_inner_helper_gpu( + params, + cpk_params, + ksk_ds_params, + single_cpk_private_key, + single_cpk, + single_cuda_ksk_ds, + single_cks, + single_cuda_sks, + msg, + br_input_modulus_log, + streams, + ); + + ( + input + .get_noise_if_decryption_was_correct() + .expect("Decryption Failed"), + after_ks_ds + .get_noise_if_decryption_was_correct() + .expect("Decryption Failed"), + before_ms + .get_noise_if_decryption_was_correct() + .expect("Decryption Failed"), + after_ms + .get_noise_if_decryption_was_correct() + .expect("Decryption Failed"), + ) +} + +#[allow(clippy::too_many_arguments)] +#[allow(clippy::type_complexity)] +fn cpk_ks_any_ms_pfail_helper_gpu( + params: AtomicPatternParameters, + cpk_params:
CompactPublicKeyEncryptionParameters, + ksk_ds_params: ShortintKeySwitchingParameters, + single_cpk_private_key: &CompactPrivateKey<Vec<u64>>, + single_cpk: &CompactPublicKey, + single_cuda_ksk_ds: &CudaKeySwitchingKey<'_>, + single_cks: &ClientKey, + single_cuda_sks: &CudaServerKey, + msg: u64, + br_input_modulus_log: CiphertextModulusLog, + streams: &CudaStreams, +) -> DecryptionAndNoiseResult { + let (_input, _after_ks_ds, _before_ms, after_ms) = cpk_ks_any_ms_inner_helper_gpu( + params, + cpk_params, + ksk_ds_params, + single_cpk_private_key, + single_cpk, + single_cuda_ksk_ds, + single_cks, + single_cuda_sks, + msg, + br_input_modulus_log, + streams, + ); + + after_ms +} + +fn noise_check_encrypt_cpk_ks_ms_noise_gpu(meta_params: MetaParameters) { + let (params, cpk_params, ksk_ds_params) = { + let compute_params = meta_params.compute_parameters; + let dedicated_cpk_params = meta_params.dedicated_compact_public_key_parameters.unwrap(); + // To avoid the expand logic of shortint which would force a keyswitch + LUT eval after + // expand + let cpk_params = { + let mut cpk_params = dedicated_cpk_params.pke_params; + cpk_params.expansion_kind = CompactCiphertextListExpansionKind::NoCasting( + compute_params.encryption_key_choice().into_pbs_order(), + ); + cpk_params + }; + + (compute_params, cpk_params, dedicated_cpk_params.ksk_params) + }; + let gpu_index = 0; + let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)); + let cpk_private_key = CompactPrivateKey::new(cpk_params); + let cpk = CompactPublicKey::new(&cpk_private_key); + + let block_params: ShortintParameterSet = params.into(); + let cks = crate::integer::ClientKey::new(block_params); + let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks); + let sks = compressed_server_key.decompress(); + let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams); + let ksk = KeySwitchingKey::new((&cpk_private_key, None), (&cks, &sks), ksk_ds_params); + let cuda_ksk_material = CudaKeySwitchingKeyMaterial::from_key_switching_key(&ksk, &streams); + let cuda_ksk = + CudaKeySwitchingKey::from_cuda_key_switching_key_material(&cuda_ksk_material, &cuda_sks); + + let noise_simulation_ksk = + NoiseSimulationLweKeyswitchKey::new_from_atomic_pattern_parameters(params); + let noise_simulation_ksk_ds = + NoiseSimulationLweKeyswitchKey::new_from_cpk_params(cpk_params, ksk_ds_params, params); + let noise_simulation_modulus_switch_config = + NoiseSimulationModulusSwitchConfig::new_from_atomic_pattern_parameters(params); + + let modulus_switch_config = sks.key.noise_simulation_modulus_switch_config(); + let cuda_modulus_switch_config = cuda_sks.noise_simulation_modulus_switch_config(); + let compute_br_input_modulus_log = sks.key.br_input_modulus_log(); + let expected_average_after_ms = + modulus_switch_config.expected_average_after_ms(params.polynomial_size()); + + assert!(noise_simulation_ksk.matches_actual_shortint_server_key(&sks.key)); + assert!(noise_simulation_ksk_ds.matches_actual_shortint_keyswitching_key(&ksk.key.as_view())); + assert!(noise_simulation_modulus_switch_config + .matches_shortint_server_key_modulus_switch_config(modulus_switch_config)); + + let (_input_sim, _after_ks_ds_sim, _after_drift_sim, after_ms_sim) = { + let noise_simulation_input = NoiseSimulationLwe::encrypt_with_cpk(&cpk.key); + cpk_ks_any_ms( + noise_simulation_input, + &noise_simulation_ksk_ds, + noise_simulation_modulus_switch_config.as_ref(), + compute_br_input_modulus_log, + &mut (), + ) + }; + + let
sample_input = { + let compact_list = cpk.key.encrypt_slice(&[0]); + let mut expanded = compact_list + .expand(ShortintCompactCiphertextListCastingMode::NoCasting) + .unwrap(); + assert_eq!(expanded.len(), 1); + + DynLwe::U64(expanded.pop().unwrap().ct) + }; + let d_ct_input = + CudaLweCiphertextList::from_lwe_ciphertext(&sample_input.as_lwe_64(), &streams); + let gpu_sample_input = CudaDynLwe::U64(d_ct_input); + + let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo { + degree: crate::shortint::ciphertext::Degree::new(1), + message_modulus: params.message_modulus(), + carry_modulus: params.carry_modulus(), + atomic_pattern: crate::shortint::AtomicPatternKind::Standard( + crate::shortint::PBSOrder::KeyswitchBootstrap, + ), + noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL, + }; + let mut cuda_side_resources = CudaSideResources::new(&streams, cuda_block_info); + // Check that the circuit is correct with respect to the core implementation, i.e. that it + // does not crash on dimension checks + let (expected_lwe_dimension_out, expected_modulus_f64_out) = { + let (_input, _after_ks_ds, _before_ms, after_ms) = cpk_ks_any_ms( + gpu_sample_input, + &cuda_ksk, + cuda_modulus_switch_config, + compute_br_input_modulus_log, + &mut cuda_side_resources, + ); + + (after_ms.lwe_dimension(), after_ms.raw_modulus_float()) + }; + + assert_eq!(after_ms_sim.lwe_dimension(), expected_lwe_dimension_out); + assert_eq!(after_ms_sim.modulus().as_f64(), expected_modulus_f64_out); + + let cleartext_modulus = params.message_modulus().0 * params.carry_modulus().0; + let mut noise_samples_before_ms = vec![]; + let mut noise_samples_after_ms = vec![]; + + let sample_count_per_msg = 1000usize; + let chunk_size = 8; + let vec_local_streams = (0..chunk_size) + .map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index))) + .collect::<Vec<_>>(); + + for _ in 0..cleartext_modulus { + let (current_noise_samples_before_ms, current_noise_samples_after_ms): (Vec<_>, Vec<_>) = (0 + ..sample_count_per_msg) + .collect::<Vec<_>>() + .chunks(chunk_size) + .flat_map(|chunk| { + chunk + .iter() + .collect::<Vec<_>>() + .into_par_iter() + .map(|i| { + let local_stream = &vec_local_streams[*i % chunk_size]; + let (_input, _after_ks_ds, before_ms, after_ms) = + cpk_ks_any_ms_noise_helper_gpu( + params, + cpk_params, + ksk_ds_params, + &cpk_private_key, + &cpk, + &cuda_ksk, + &cks, + &cuda_sks, + 0, + compute_br_input_modulus_log, + local_stream, + ); + (before_ms.value, after_ms.value) + }) + .collect::<Vec<_>>() + }) + .unzip(); + + noise_samples_before_ms.extend(current_noise_samples_before_ms); + noise_samples_after_ms.extend(current_noise_samples_after_ms); + } + + let before_ms_normality = normality_check(&noise_samples_before_ms, "before ms", 0.01); + + let after_ms_is_ok = mean_and_variance_check( + &noise_samples_after_ms, + "after_ms", + expected_average_after_ms, + after_ms_sim.variance(), + params.lwe_noise_distribution(), + after_ms_sim.lwe_dimension(), + after_ms_sim.modulus().as_f64(), + ); + + assert!(before_ms_normality.null_hypothesis_is_valid && after_ms_is_ok); +} + +create_parameterized_test!(noise_check_encrypt_cpk_ks_ms_noise_gpu { + TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128, +}); + +fn noise_check_encrypt_cpk_ks_ms_pfail_gpu(meta_params: MetaParameters) { + let (params, cpk_params, ksk_ds_params) = { + let compute_params = meta_params.compute_parameters; + let dedicated_cpk_params = meta_params.dedicated_compact_public_key_parameters.unwrap(); + // To avoid the expand logic of shortint which would
force a keyswitch + LUT eval after + // expand + let cpk_params = { + let mut cpk_params = dedicated_cpk_params.pke_params; + cpk_params.expansion_kind = CompactCiphertextListExpansionKind::NoCasting( + compute_params.encryption_key_choice().into_pbs_order(), + ); + cpk_params + }; + + (compute_params, cpk_params, dedicated_cpk_params.ksk_params) + }; + + let (pfail_test_meta, params) = { + let mut ap_params = params; + + let original_message_modulus = ap_params.message_modulus(); + let original_carry_modulus = ap_params.carry_modulus(); + + // For now only allow 2_2 parameters; heuristics for other parameters can come later + assert_eq!(original_message_modulus.0, 4); + assert_eq!(original_carry_modulus.0, 4); + + // Update parameters to fail more frequently by inflating the carry modulus, which allows + // keeping the max multiplication without risk of message overflow + let (original_pfail_and_precision, new_expected_pfail_and_precision) = + update_ap_params_for_pfail( + &mut ap_params, + original_message_modulus, + CarryModulus(1 << 5), + ); + + let pfail_test_meta = if should_run_short_pfail_tests_debug() { + let expected_fails = 200; + PfailTestMeta::new_with_desired_expected_fails( + original_pfail_and_precision, + new_expected_pfail_and_precision, + expected_fails, + ) + } else { + let total_runs = 1_000_000; + PfailTestMeta::new_with_total_runs( + original_pfail_and_precision, + new_expected_pfail_and_precision, + total_runs, + ) + }; + + (pfail_test_meta, ap_params) + }; + let gpu_index = 0; + let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)); + let cpk_private_key = CompactPrivateKey::new(cpk_params); + let cpk = CompactPublicKey::new(&cpk_private_key); + + let block_params: ShortintParameterSet = params.into(); + let cks = crate::integer::ClientKey::new(block_params); + let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks); + let sks = compressed_server_key.decompress(); + let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams); + let ksk = + new_key_switching_key_for_pfail_test((&cpk_private_key, None), (&cks, &sks), ksk_ds_params); + let cuda_ksk_material = CudaKeySwitchingKeyMaterial::from_key_switching_key(&ksk, &streams); + let cuda_ksk = + CudaKeySwitchingKey::from_cuda_key_switching_key_material(&cuda_ksk_material, &cuda_sks); + + let total_runs_for_expected_fails = pfail_test_meta.total_runs_for_expected_fails(); + let chunk_size = 8; + let vec_local_streams = (0..chunk_size) + .map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index))) + .collect::<Vec<_>>(); + let measured_fails: f64 = (0..total_runs_for_expected_fails) + .collect::<Vec<_>>() + .chunks(chunk_size) + .flat_map(|chunk| { + chunk + .iter() + .collect::<Vec<_>>() + .into_par_iter() + .map(|i| { + let local_stream = &vec_local_streams[*i as usize % chunk_size]; + let after_ms_decryption_result = cpk_ks_any_ms_pfail_helper_gpu( + params, + cpk_params, + ksk_ds_params, + &cpk_private_key, + &cpk, + &cuda_ksk, + &cks, + &cuda_sks, + 0, + sks.key.br_input_modulus_log(), + local_stream, + ); + after_ms_decryption_result.failure_as_f64() + }) + .collect::<Vec<_>>() + }) + .sum(); + + let test_result = PfailTestResult { measured_fails }; + + pfail_check(&pfail_test_meta, test_result); +} + +create_parameterized_test!(noise_check_encrypt_cpk_ks_ms_pfail_gpu { + TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128, +}); + +fn sanity_check_encrypt_cpk_ks_ms_pbs_gpu(meta_params: MetaParameters) { + let (params, cpk_params, ksk_ds_params) = { + let compute_params =
meta_params.compute_parameters; + let dedicated_cpk_params = meta_params.dedicated_compact_public_key_parameters.unwrap(); + // To avoid the expand logic of shortint which would force a keyswitch + LUT eval after + // expand + let (cpk_params, orig_cast_mode) = { + let mut cpk_params = dedicated_cpk_params.pke_params; + let orig_cast_mode = cpk_params.expansion_kind; + cpk_params.expansion_kind = CompactCiphertextListExpansionKind::NoCasting( + compute_params.encryption_key_choice().into_pbs_order(), + ); + (cpk_params, orig_cast_mode) + }; + + assert!(matches!( + orig_cast_mode, + CompactCiphertextListExpansionKind::RequiresCasting + )); + + (compute_params, cpk_params, dedicated_cpk_params.ksk_params) + }; + let gpu_index = 0; + let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)); + let cpk_private_key = CompactPrivateKey::new(cpk_params); + let cpk = CompactPublicKey::new(&cpk_private_key); + + let block_params: ShortintParameterSet = params.into(); + let cks = crate::integer::ClientKey::new(block_params); + let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks); + let sks = compressed_server_key.decompress(); + let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams); + let ksk = KeySwitchingKey::new((&cpk_private_key, None), (&cks, &sks), ksk_ds_params); + let cuda_ksk_material = CudaKeySwitchingKeyMaterial::from_key_switching_key(&ksk, &streams); + let cuda_ksk = + CudaKeySwitchingKey::from_cuda_key_switching_key_material(&cuda_ksk_material, &cuda_sks); + let modulus_switch_config = cuda_sks.noise_simulation_modulus_switch_config(); + let compute_br_input_modulus_log = sks.key.br_input_modulus_log(); + + let id_lut = cuda_sks.generate_lookup_table(|x| x); + let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut.acc, &streams); + + let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo { + degree: crate::shortint::ciphertext::Degree::new(1), + message_modulus: params.message_modulus(), + carry_modulus: params.carry_modulus(), + atomic_pattern: crate::shortint::AtomicPatternKind::Standard( + crate::shortint::PBSOrder::KeyswitchBootstrap, + ), + noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL, + }; + let mut cuda_side_resources = CudaSideResources::new(&streams, cuda_block_info); + + for _ in 0..10 { + let (gpu_sample_input, shortint_res) = { + let mut engine = ShortintEngine::new(); + let no_casting_compact_list = cpk.key.encrypt_iter_with_modulus_with_engine( + core::iter::once(0), + cpk.key.parameters.message_modulus.0, + &mut engine, + ); + + let num_blocks = 1usize; + let data_info = vec![DataKind::Unsigned(NonZeroUsize::new(num_blocks).unwrap())]; + // This is for the AP + let cuda_no_casting_compact_list = + CudaFlattenedVecCompactCiphertextList::from_vec_shortint_compact_ciphertext_list( + vec![no_casting_compact_list.clone()], + data_info, + &cuda_side_resources.streams, + ); + + // This is for the verification + let cuda_casting_compact_list = + cuda_no_casting_compact_list.duplicate(&cuda_side_resources.streams); + + let cuda_no_casting_compact_list_expander = cuda_no_casting_compact_list + .expand( + &cuda_ksk, + crate::integer::gpu::ZKType::NoCasting, + &cuda_side_resources.streams, + ) + .unwrap(); + + let cuda_ap_input_expanded: CudaUnsignedRadixCiphertext = + cuda_no_casting_compact_list_expander + .get(0usize, &cuda_side_resources.streams) + .unwrap() + .unwrap(); + + let cuda_casting_compact_list_expander = cuda_casting_compact_list + .expand(
&cuda_ksk, + crate::integer::gpu::ZKType::SanityCheck, + &cuda_side_resources.streams, + ) + .unwrap(); + + let cuda_int_res: CudaUnsignedRadixCiphertext = cuda_casting_compact_list_expander + .get(0usize, &cuda_side_resources.streams) + .unwrap() + .unwrap(); + + ( + CudaDynLwe::U64( + cuda_ap_input_expanded + .ciphertext + .d_blocks + .duplicate(&cuda_side_resources.streams), + ), + cuda_int_res + .ciphertext + .d_blocks + .to_lwe_ciphertext_list(&cuda_side_resources.streams), + ) + }; + + let (_input, _after_ks, _before_ms, after_ms) = cpk_ks_any_ms( + gpu_sample_input, + &cuda_ksk, + modulus_switch_config, + compute_br_input_modulus_log, + &mut cuda_side_resources, + ); + + // Complete the AP by computing the PBS to match shortint + let mut pbs_result = d_accumulator.allocate_lwe_bootstrap_result(&mut cuda_side_resources); + cuda_sks.lwe_classic_fft_pbs( + &after_ms, + &mut pbs_result, + &d_accumulator, + &mut cuda_side_resources, + ); + + let pbs_result_list = pbs_result + .as_lwe_64() + .to_lwe_ciphertext_list(&cuda_side_resources.streams); + + assert_eq!(pbs_result_list, shortint_res); + } +} + +// Trait implementations for CudaKeySwitchingKey to enable noise distribution tests +impl AllocateLweKeyswitchResult for CudaKeySwitchingKey<'_> { + type Output = CudaDynLwe; + type SideResources = CudaSideResources; + + fn allocate_lwe_keyswitch_result( + &self, + side_resources: &mut Self::SideResources, + ) -> Self::Output { + let output_lwe_dimension = self + .key_switching_key_material + .lwe_keyswitch_key + .output_key_lwe_size() + .to_lwe_dimension(); + let lwe_ciphertext_count = LweCiphertextCount(1); + let ciphertext_modulus = self.dest_server_key.ciphertext_modulus; + + let cuda_lwe = CudaLweCiphertextList::new( + output_lwe_dimension, + lwe_ciphertext_count, + ciphertext_modulus, + &side_resources.streams, + ); + CudaDynLwe::U64(cuda_lwe) + } +} + +impl LweKeyswitch for CudaKeySwitchingKey<'_> { + type SideResources = CudaSideResources; + + fn lwe_keyswitch( + &self, + input: &CudaDynLwe, + output: &mut CudaDynLwe, + side_resources: &mut Self::SideResources, + ) { + match (input, output) { + (CudaDynLwe::U64(input_cuda_lwe), CudaDynLwe::U64(output_cuda_lwe)) => { + let d_input_indexes = CudaVec::<u64>::new(1, &side_resources.streams, 0); + let d_output_indexes = CudaVec::<u64>::new(1, &side_resources.streams, 0); + + cuda_keyswitch_lwe_ciphertext( + &self.key_switching_key_material.lwe_keyswitch_key, + input_cuda_lwe, + output_cuda_lwe, + &d_input_indexes, + &d_output_indexes, + false, + &side_resources.streams, + false, + ); + } + (CudaDynLwe::U32(_), CudaDynLwe::U32(_)) => { + panic!( + "U32 keyswitch not implemented for CudaKeySwitchingKey - only U64 is supported" + ); + } + (CudaDynLwe::U128(_), CudaDynLwe::U128(_)) => { + panic!("U128 keyswitch not implemented for CudaKeySwitchingKey - only U64 is supported"); + } + _ => panic!("Inconsistent input/output types for CudaDynLwe keyswitch"), + } + } +} + +create_parameterized_test!(sanity_check_encrypt_cpk_ks_ms_pbs_gpu { + TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128, +}); diff --git a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/mod.rs b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/mod.rs index 6ae93d65e..a9c57038a 100644 --- a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/mod.rs @@ -1,5 +1,6 @@ pub mod br_dp_ks_ms; pub mod br_dp_packingks_ms; +pub mod cpk_ks_ms; pub mod dp_ks_ms; pub mod
dp_ks_pbs_128_packingks; pub mod utils; diff --git a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/key_switching_test_utils.rs b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/key_switching_test_utils.rs new file mode 100644 index 000000000..a31134f29 --- /dev/null +++ b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/key_switching_test_utils.rs @@ -0,0 +1,28 @@ +use crate::integer::client_key::secret_encryption_key::SecretEncryptionKeyView; +use crate::integer::key_switching_key::KeySwitchingKey; +use crate::integer::{ClientKey, ServerKey}; +use crate::shortint::parameters::ShortintKeySwitchingParameters; + +/// Test-only implementation of KeySwitchingKey::new that skips the cast_rshift assertion. +/// This is needed for pfail tests where we intentionally use different message modulus and carry. +pub fn new_key_switching_key_for_pfail_test<'input_key, InputEncryptionKey, ClientKeyType>( + input_key_pair: (InputEncryptionKey, Option<&ServerKey>), + output_key_pair: (&ClientKeyType, &ServerKey), + params: ShortintKeySwitchingParameters, +) -> KeySwitchingKey +where + InputEncryptionKey: Into>, + ClientKeyType: AsRef, +{ + let input_secret_encryption_key: SecretEncryptionKeyView<'_> = input_key_pair.0.into(); + KeySwitchingKey { + key: crate::shortint::KeySwitchingKey::new( + ( + input_secret_encryption_key.key, + input_key_pair.1.map(|k| &k.key), + ), + (&output_key_pair.0.as_ref().key, &output_key_pair.1.key), + params, + ), + } +} diff --git a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/mod.rs b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/mod.rs index 44d0c45d7..af1f3f5f0 100644 --- a/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/tests_noise_distribution/utils/mod.rs @@ -1 +1,2 @@ +pub mod key_switching_test_utils; pub mod noise_simulation; diff --git a/tfhe/src/integer/gpu/zk/mod.rs b/tfhe/src/integer/gpu/zk/mod.rs index a08565334..af829499d 100644 --- a/tfhe/src/integer/gpu/zk/mod.rs +++ b/tfhe/src/integer/gpu/zk/mod.rs @@ -73,7 +73,8 @@ impl CudaProvenCompactCiphertextList { key: &CudaKeySwitchingKey, streams: &CudaStreams, ) -> crate::Result { - self.d_flattened_compact_lists.expand(key, streams) + self.d_flattened_compact_lists + .expand(key, super::ZKType::Casting, streams) } pub fn from_proven_compact_ciphertext_list(