Compare commits

...

20 Commits

Author SHA1 Message Date
Theo Souchon
5f6e1fe648 chore(bench): add missing operation in hlapi 2026-01-12 14:58:08 +01:00
Arthur Meyre
cb4d62b40a chore: fix wasm-pack URL and update build output listing
Corrected the URL for 'wasm-pack' and updated the file listing after the build.

co-authored-by: d4wae89d498 <faussurier.marc@icloud.com>
2026-01-12 12:51:04 +01:00
David Testé
7a0c054095 chore(bench): use ks32 parameters set as default only for cpu 2026-01-12 11:00:52 +01:00
Agnes Leroy
ddb7d56f56 chore(gpu): add neg to dedup ops 2026-01-12 11:00:52 +01:00
Guillermo Oyarzun
cbe39c8e98 feat(gpu): create noise and pfail tests pbs128 and packingks 2026-01-12 10:46:41 +01:00
pgardratzama
27364857f1 fix(hpu): prf is not available yet on HPU 2026-01-12 09:55:18 +01:00
Arthur Meyre
7043246c17 chore: update CODEOWNERS file 2026-01-09 16:12:50 +01:00
Theo Souchon
51735fb8ed chore(bench): code refactor and automation for hlapi 2026-01-09 16:09:27 +01:00
pgardratzama
23a348c9ae feat(hpu): new HPU bitstream RTL v2.2 2026-01-09 15:25:35 +01:00
Mayeul@Zama
61b616b784 chore(hlapi): add bench of oprf over any range 2026-01-09 15:19:08 +01:00
Mayeul@Zama
df48e176f3 feat(hlapi): add oprf over any range 2026-01-09 15:19:08 +01:00
Mayeul@Zama
dd2345df6b refactor(integer): use NonZeroU64 for excluded_upper_bound 2026-01-09 15:19:08 +01:00
Mayeul@Zama
933800ea6f doc(hlapi): fix documentation 2026-01-09 15:19:08 +01:00
Mayeul@Zama
3e4cee3a75 refactor(integer): split oprf_almost_uniformity_test 2026-01-09 15:19:08 +01:00
Mayeul@Zama
00ea9b8e07 refactor(shortint): improve error in uniformity_p_value 2026-01-09 15:19:08 +01:00
Mayeul@Zama
23ce85f6a2 fix(core): make sup_diff more permissive 2026-01-09 15:19:08 +01:00
Nicolas Sarlin
126a95e929 fix(js): unsafe coop bench was overwriting mt one 2026-01-08 16:48:18 +01:00
Nicolas Sarlin
23fffb1443 chore(deps): ignore unmaintained bincode cargo audit warning 2026-01-08 15:16:37 +01:00
Agnes Leroy
6d58a54266 chore(gpu): attempt to fix apt in ci 2026-01-08 14:54:03 +01:00
Baptiste Roux
9b8d5f5a43 chore(hpu): bump version of lru
Lru required a version update following cargo audit

Signed-off-by: Baptiste Roux <baptiste.roux@zama.ai>
2026-01-08 14:08:31 +01:00
36 changed files with 2960 additions and 254 deletions

View File

@@ -2,6 +2,8 @@
ignore = [
# Ignoring unmaintained 'paste' advisory as it is a widely used, low-risk build dependency.
"RUSTSEC-2024-0436",
# Ignoring unmaintained 'bincode' crate. Getting rid of it would be too complex in the short term.
"RUSTSEC-2025-0141",
]
[output]

View File

@@ -23,6 +23,8 @@ runs:
echo "${CMAKE_SCRIPT_SHA} cmake-${CMAKE_VERSION}-linux-x86_64.sh" > checksum
sha256sum -c checksum
sudo bash cmake-"${CMAKE_VERSION}"-linux-x86_64.sh --skip-license --prefix=/usr/ --exclude-subdir
sudo apt-get clean
sudo rm -rf /var/lib/apt/lists/*
sudo apt update
sudo apt remove -y unattended-upgrades
sudo apt install -y cmake-format libclang-dev

.gitignore vendored
View File

@@ -10,6 +10,7 @@ target/
**/*.rmeta
**/Cargo.lock
**/*.bin
**/.DS_Store
# Some of our bench outputs
/tfhe/benchmarks_parameters

View File

@@ -11,7 +11,7 @@
/tfhe/src/core_crypto/gpu @agnesLeroy
/tfhe/src/core_crypto/hpu @zama-ai/hardware
/tfhe/src/shortint/ @mayeul-zama
/tfhe/src/shortint/ @mayeul-zama @nsarlin-zama
/tfhe/src/integer/ @tmontaigu
/tfhe/src/integer/gpu @agnesLeroy
@@ -19,8 +19,12 @@
/tfhe/src/high_level_api/ @tmontaigu
/tfhe-zk-pok/ @nsarlin-zama
/tfhe-benchmark/ @soonum
/utils/ @nsarlin-zama
/Makefile @IceTDrinker @soonum
/mockups/tfhe-hpu-mockup @zama-ai/hardware

View File

@@ -36,6 +36,7 @@ rayon = "1.11"
serde = { version = "1.0", default-features = false }
wasm-bindgen = "0.2.101"
getrandom = "0.2.8"
# The project maintainers consider that this is the last version of the 1.3 branch; any newer version should not be trusted
bincode = "=1.3.3"
[profile.bench]

View File

@@ -65,6 +65,16 @@ void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
void cleanup_cuda_integer_decompress_radix_ciphertext_128(
CudaStreamsFFI streams, int8_t **mem_ptr_void);
void cuda_integer_extract_glwe_128(
CudaStreamsFFI streams, void *glwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_list,
uint32_t const glwe_index);
void cuda_integer_extract_glwe_64(
CudaStreamsFFI streams, void *glwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_list,
uint32_t const glwe_index);
}
#endif

View File

@@ -155,3 +155,24 @@ void cleanup_cuda_integer_decompress_radix_ciphertext_128(
delete mem_ptr;
*mem_ptr_void = nullptr;
}
void cuda_integer_extract_glwe_128(
CudaStreamsFFI streams, void *glwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_list,
uint32_t const glwe_index) {
CudaStreams _streams = CudaStreams(streams);
host_extract<__uint128_t>(_streams.stream(0), _streams.gpu_index(0),
(__uint128_t *)glwe_array_out, glwe_list,
glwe_index);
}
void cuda_integer_extract_glwe_64(
CudaStreamsFFI streams, void *glwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_list,
uint32_t const glwe_index) {
CudaStreams _streams = CudaStreams(streams);
host_extract<__uint64_t>(_streams.stream(0), _streams.gpu_index(0),
(__uint64_t *)glwe_array_out, glwe_list, glwe_index);
}

View File

@@ -2349,6 +2349,22 @@ unsafe extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn cuda_integer_extract_glwe_128(
streams: CudaStreamsFFI,
glwe_array_out: *mut ffi::c_void,
glwe_list: *const CudaPackedGlweCiphertextListFFI,
glwe_index: u32,
);
}
unsafe extern "C" {
pub fn cuda_integer_extract_glwe_64(
streams: CudaStreamsFFI,
glwe_array_out: *mut ffi::c_void,
glwe_list: *const CudaPackedGlweCiphertextListFFI,
glwe_index: u32,
);
}
unsafe extern "C" {
pub fn scratch_cuda_rerand_64(
streams: CudaStreamsFFI,

View File

@@ -40,7 +40,7 @@ rand = "0.8.5"
regex = "1.10.4"
bitflags = { version = "2.5.0", features = ["serde"] }
itertools = "0.11.0"
lru = "0.12.3"
lru = "0.16.3"
bitfield-struct = "0.10.0"
crossbeam = { version = "0.8.4", features = ["crossbeam-queue"] }
rayon = { workspace = true }

View File

@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:35cc06547a23b862ab9829351d74d944e60ea9dad3ecf593d15f0ce8445d145e
size 81710610
oid sha256:934c8131c12010dc837f6a2af5111b83f8f5d42f10485e9b3b971edb24c467f8
size 82201876

View File

@@ -160,9 +160,9 @@ impl ProgramInner {
.filter(|(_, var)| var.is_none())
.map(|(rid, _)| *rid)
.collect::<Vec<_>>();
demote_order
.into_iter()
.for_each(|rid| self.regs.demote(&rid));
demote_order.into_iter().for_each(|rid| {
self.regs.demote(&rid);
});
}
/// Release register entry
@@ -179,7 +179,7 @@ impl ProgramInner {
/// Notify register access to update LRU state
pub(crate) fn reg_access(&mut self, rid: asm::RegId) {
self.regs.promote(&rid)
self.regs.promote(&rid);
}
/// Retrieved least-recent-used heap entry
@@ -220,9 +220,9 @@ impl ProgramInner {
.filter(|(_mid, var)| var.is_none())
.map(|(mid, _)| *mid)
.collect::<Vec<_>>();
demote_order
.into_iter()
.for_each(|mid| self.heap.demote(&mid));
demote_order.into_iter().for_each(|mid| {
self.heap.demote(&mid);
});
}
_ => { /*Only release Heap slot*/ }
}
@@ -231,7 +231,9 @@ impl ProgramInner {
/// Notify heap access to update LRU state
pub(crate) fn heap_access(&mut self, mid: asm::MemId) {
match mid {
asm::MemId::Heap { .. } => self.heap.promote(&mid),
asm::MemId::Heap { .. } => {
self.heap.promote(&mid);
}
_ => { /* Do nothing: slot does not belong to heap */ }
}
}
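A plausible reading of this hunk, given the `lru` bump from 0.12 to 0.16 in the Cargo.toml change below: `promote`/`demote` appear to now return a value, so the bare calls no longer satisfy the `()`-returning closures that `for_each` expects, and the fix wraps each call in a block that discards the result. A minimal generic sketch of the pattern (the `Cache` type and its `bool` return are stand-ins, not the real `lru` API):

```rust
struct Cache;

impl Cache {
    // Stand-in for a method that used to return `()` and now returns a value.
    fn demote(&mut self, _key: &u32) -> bool {
        true
    }
}

fn main() {
    let mut cache = Cache;
    let demote_order = vec![1_u32, 2, 3];
    // `for_each` wants a `()`-returning closure, so the call is wrapped in a
    // block and its result is discarded by the trailing `;`.
    demote_order.into_iter().for_each(|rid| {
        cache.demote(&rid);
    });
}
```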

tfhe-benchmark/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
benchmarks_parameters/*

View File

@@ -2,7 +2,9 @@ use benchmark::utilities::{
hlapi_throughput_num_ops, write_to_json, BenchmarkType, BitSizesSet, EnvConfig, OperatorType,
};
use criterion::{black_box, Criterion, Throughput};
use oprf::oprf_any_range2;
use rand::prelude::*;
use rayon::prelude::*;
use std::marker::PhantomData;
use std::ops::*;
use tfhe::core_crypto::prelude::Numeric;
@@ -11,34 +13,104 @@ use tfhe::keycache::NamedParam;
use tfhe::named::Named;
use tfhe::prelude::*;
use tfhe::{
ClientKey, CompressedServerKey, FheIntegerType, FheUint10, FheUint12, FheUint128, FheUint14,
FheUint16, FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8, FheUintId, IntegerId,
KVStore,
ClientKey, CompressedServerKey, FheBool, FheIntegerType, FheUint, FheUint10, FheUint12,
FheUint128, FheUint14, FheUint16, FheUint2, FheUint32, FheUint4, FheUint6, FheUint64, FheUint8,
FheUintId, IntegerId, KVStore,
};
use rayon::prelude::*;
mod oprf;
fn bench_fhe_type<FheType>(
trait BenchWait {
fn wait_bench(&self);
}
impl<Id: FheUintId> BenchWait for FheUint<Id> {
fn wait_bench(&self) {
self.wait()
}
}
impl BenchWait for FheBool {
fn wait_bench(&self) {
self.wait()
}
}
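// Pair results (e.g. the overflowing ops return `(FheType, FheBool)`): waiting
// on the first element is enough for benchmarking purposes.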
impl<T1: FheWait, T2> BenchWait for (T1, T2) {
fn wait_bench(&self) {
self.0.wait()
}
}
fn bench_fhe_type_unary_op<FheType, F, R>(
c: &mut Criterion,
client_key: &ClientKey,
type_name: &str,
bit_size: usize,
display_name: &str,
func_name: &str,
func: F,
) where
F: Fn(&FheType) -> R,
R: BenchWait,
FheType: FheEncrypt<u128, ClientKey>,
FheType: FheWait,
{
let mut bench_group = c.benchmark_group(type_name);
let mut bench_prefix = "hlapi".to_string();
if cfg!(feature = "gpu") {
bench_prefix = format!("{}::cuda", bench_prefix);
} else if cfg!(feature = "hpu") {
bench_prefix = format!("{}::hpu", bench_prefix);
}
bench_prefix = format!("{}::ops", bench_prefix);
let mut rng = thread_rng();
let param = client_key.computation_parameters();
let param_name = param.name();
let bit_size = bit_size as u32;
let write_record = |bench_id: String, display_name| {
write_to_json::<u64, _>(
&bench_id,
param,
&param_name,
display_name,
&OperatorType::Atomic,
bit_size,
vec![],
);
};
let lhs = FheType::encrypt(rng.gen(), client_key);
let bench_id = format!("{bench_prefix}::{func_name}::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = func(&lhs);
res.wait_bench();
black_box(res)
})
});
write_record(bench_id, display_name);
}
fn bench_fhe_type_binary_op<FheType, F, R>(
c: &mut Criterion,
client_key: &ClientKey,
type_name: &str,
bit_size: usize,
display_name: &str,
func_name: &str,
func: F,
) where
F: Fn(&FheType, &FheType) -> R,
R: BenchWait,
FheType: FheEncrypt<u128, ClientKey>,
FheType: FheWait,
for<'a> &'a FheType: Add<&'a FheType, Output = FheType>
+ Sub<&'a FheType, Output = FheType>
+ Mul<&'a FheType, Output = FheType>
+ BitAnd<&'a FheType, Output = FheType>
+ BitOr<&'a FheType, Output = FheType>
+ BitXor<&'a FheType, Output = FheType>
+ Shl<&'a FheType, Output = FheType>
+ Shr<&'a FheType, Output = FheType>
+ RotateLeft<&'a FheType, Output = FheType>
+ RotateRight<&'a FheType, Output = FheType>
+ OverflowingAdd<&'a FheType, Output = FheType>
+ OverflowingSub<&'a FheType, Output = FheType>,
for<'a> FheType: FheMin<&'a FheType, Output = FheType> + FheMax<&'a FheType, Output = FheType>,
{
let mut bench_group = c.benchmark_group(type_name);
let mut bench_prefix = "hlapi".to_string();
@@ -71,170 +143,221 @@ fn bench_fhe_type<FheType>(
let lhs = FheType::encrypt(rng.gen(), client_key);
let rhs = FheType::encrypt(rng.gen(), client_key);
let mut bench_id;
let bench_id = format!("{bench_prefix}::{func_name}::{param_name}::{type_name}");
bench_id = format!("{bench_prefix}::add::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs + &rhs;
res.wait();
let res = func(&lhs, &rhs);
res.wait_bench();
black_box(res)
})
});
write_record(bench_id, "add");
bench_id = format!("{bench_prefix}::overflowing_add::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let (res, flag) = lhs.overflowing_add(&rhs);
res.wait();
black_box((res, flag))
})
});
write_record(bench_id, "overflowing_add");
bench_id = format!("{bench_prefix}::overflowing_sub::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let (res, flag) = lhs.overflowing_sub(&rhs);
res.wait();
black_box((res, flag))
})
});
write_record(bench_id, "overflowing_sub");
bench_id = format!("{bench_prefix}::sub::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs - &rhs;
res.wait();
black_box(res)
})
});
write_record(bench_id, "sub");
bench_id = format!("{bench_prefix}::mul::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs * &rhs;
res.wait();
black_box(res)
})
});
write_record(bench_id, "mul");
bench_id = format!("{bench_prefix}::bitand::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs & &rhs;
res.wait();
black_box(res)
})
});
write_record(bench_id, "bitand");
bench_id = format!("{bench_prefix}::bitor::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs | &rhs;
res.wait();
black_box(res)
})
});
write_record(bench_id, "bitor");
bench_id = format!("{bench_prefix}::bitxor::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs ^ &rhs;
res.wait();
black_box(res)
})
});
write_record(bench_id, "bitxor");
bench_id = format!("{bench_prefix}::left_shift::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs << &rhs;
res.wait();
black_box(res)
})
});
write_record(bench_id, "left_shift");
bench_id = format!("{bench_prefix}::right_shift::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = &lhs >> &rhs;
res.wait();
black_box(res)
})
});
write_record(bench_id, "right_shift");
bench_id = format!("{bench_prefix}::left_rotate::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = (&lhs).rotate_left(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "left_rotate");
bench_id = format!("{bench_prefix}::right_rotate::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = (&lhs).rotate_right(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "right_rotate");
bench_id = format!("{bench_prefix}::min::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = lhs.min(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "min");
bench_id = format!("{bench_prefix}::max::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = lhs.max(&rhs);
res.wait();
black_box(res)
})
});
write_record(bench_id, "max");
write_record(bench_id, display_name);
}
macro_rules! bench_type {
($fhe_type:ident) => {
fn bench_fhe_type_ternary_op<FheType, F, R>(
c: &mut Criterion,
client_key: &ClientKey,
type_name: &str,
bit_size: usize,
display_name: &str,
func_name: &str,
func: F,
) where
F: Fn(&FheBool, &FheType, &FheType) -> R,
R: BenchWait,
FheType: FheEncrypt<u128, ClientKey>,
FheType: FheWait,
{
let mut bench_group = c.benchmark_group(type_name);
let mut bench_prefix = "hlapi".to_string();
if cfg!(feature = "gpu") {
bench_prefix = format!("{}::cuda", bench_prefix);
} else if cfg!(feature = "hpu") {
bench_prefix = format!("{}::hpu", bench_prefix);
}
bench_prefix = format!("{}::ops", bench_prefix);
let mut rng = thread_rng();
let param = client_key.computation_parameters();
let param_name = param.name();
let bit_size = bit_size as u32;
let write_record = |bench_id: String, display_name| {
write_to_json::<u64, _>(
&bench_id,
param,
&param_name,
display_name,
&OperatorType::Atomic,
bit_size,
vec![],
);
};
let bool = FheBool::encrypt(rng.gen(), client_key);
let lhs = FheType::encrypt(rng.gen(), client_key);
let rhs = FheType::encrypt(rng.gen(), client_key);
let bench_id = format!("{bench_prefix}::{func_name}::{param_name}::{type_name}");
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
let res = func(&bool, &lhs, &rhs);
res.wait_bench();
black_box(res)
})
});
write_record(bench_id, display_name);
}
macro_rules! bench_type_binary_op (
(type_name: $fhe_type:ident, display_name: $display_name:literal, operation: $op:ident) => {
::paste::paste! {
fn [<bench_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
bench_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
fn [<bench_ $fhe_type:snake _ $op>](c: &mut Criterion, cks: &ClientKey) {
bench_fhe_type_binary_op::<$fhe_type, _, _>(
c,
cks,
stringify!($fhe_type),
$fhe_type::num_bits(),
$display_name,
stringify!($op),
|lhs, rhs| lhs.$op(rhs)
);
}
}
};
);
macro_rules! bench_type_unary_op (
(type_name: $fhe_type:ident, display_name: $display_name:literal, operation: $op:ident) => {
::paste::paste! {
fn [<bench_ $fhe_type:snake _ $op>](c: &mut Criterion, cks: &ClientKey) {
bench_fhe_type_unary_op::<$fhe_type, _, _>(
c,
cks,
stringify!($fhe_type),
$fhe_type::num_bits(),
$display_name,
stringify!($op),
|lhs| lhs.$op()
);
}
}
};
);
macro_rules! bench_type_ternary_op (
(type_name: $fhe_type:ident, display_name: $display_name:literal, operation: $op:ident) => {
::paste::paste! {
fn [<bench_ $fhe_type:snake _ $op>](c: &mut Criterion, cks: &ClientKey) {
bench_fhe_type_ternary_op::<$fhe_type, _, _>(
c,
cks,
stringify!($fhe_type),
$fhe_type::num_bits(),
$display_name,
stringify!($op),
|cond, lhs, rhs| cond.$op(lhs, rhs)
);
}
}
};
);
macro_rules! generate_typed_benches {
($fhe_type:ident) => {
// bench_type_binary_op!(type_name: $fhe_type, display_name: "sum", operation: sum);
// bench_type_unary_op!(type_name: $fhe_type, display_name: "bitnot", operation: bitnot);
bench_type_binary_op!(type_name: $fhe_type, display_name: "add", operation: add);
bench_type_binary_op!(type_name: $fhe_type, display_name: "bitand", operation: bitand);
bench_type_binary_op!(type_name: $fhe_type, display_name: "bitor", operation: bitor);
bench_type_binary_op!(type_name: $fhe_type, display_name: "bitxor", operation: bitxor);
bench_type_binary_op!(type_name: $fhe_type, display_name: "div", operation: div);
bench_type_binary_op!(type_name: $fhe_type, display_name: "div_rem", operation: div_rem);
bench_type_binary_op!(type_name: $fhe_type, display_name: "eq", operation: eq);
bench_type_binary_op!(type_name: $fhe_type, display_name: "ge", operation: ge);
bench_type_binary_op!(type_name: $fhe_type, display_name: "gt", operation: gt);
bench_type_binary_op!(type_name: $fhe_type, display_name: "le", operation: le);
bench_type_binary_op!(type_name: $fhe_type, display_name: "left_rotate", operation: rotate_left);
bench_type_binary_op!(type_name: $fhe_type, display_name: "left_shift", operation: shl);
bench_type_binary_op!(type_name: $fhe_type, display_name: "lt", operation: lt);
bench_type_binary_op!(type_name: $fhe_type, display_name: "max", operation: max);
bench_type_binary_op!(type_name: $fhe_type, display_name: "min", operation: min);
bench_type_binary_op!(type_name: $fhe_type, display_name: "mul", operation: mul);
bench_type_binary_op!(type_name: $fhe_type, display_name: "ne", operation: ne);
bench_type_binary_op!(type_name: $fhe_type, display_name: "overflowing_add", operation: overflowing_add);
bench_type_binary_op!(type_name: $fhe_type, display_name: "overflowing_sub", operation: overflowing_sub);
bench_type_binary_op!(type_name: $fhe_type, display_name: "rem", operation: rem);
bench_type_binary_op!(type_name: $fhe_type, display_name: "right_rotate", operation: rotate_right);
bench_type_binary_op!(type_name: $fhe_type, display_name: "right_shift", operation: shr);
bench_type_binary_op!(type_name: $fhe_type, display_name: "sub", operation: sub);
bench_type_ternary_op!(type_name: $fhe_type, display_name: "flip", operation: flip);
bench_type_ternary_op!(type_name: $fhe_type, display_name: "if_then_else", operation: if_then_else);
bench_type_unary_op!(type_name: $fhe_type, display_name: "leading_ones", operation: leading_ones);
bench_type_unary_op!(type_name: $fhe_type, display_name: "leading_zeros", operation: leading_zeros);
bench_type_unary_op!(type_name: $fhe_type, display_name: "neg", operation: neg);
bench_type_unary_op!(type_name: $fhe_type, display_name: "not", operation: not);
bench_type_unary_op!(type_name: $fhe_type, display_name: "trailing_ones", operation: trailing_ones);
bench_type_unary_op!(type_name: $fhe_type, display_name: "trailing_zeros", operation: trailing_zeros);
};
}
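For readability, here is roughly what one binary-op instantiation expands to; a sketch of the `paste`-generated item for the `add` line emitted by `generate_typed_benches!(FheUint64)`:

```rust
// `[<bench_ $fhe_type:snake _ $op>]` concatenates and snake-cases the pieces
// into a single function name, so the invocation yields:
fn bench_fhe_uint64_add(c: &mut Criterion, cks: &ClientKey) {
    bench_fhe_type_binary_op::<FheUint64, _, _>(
        c,
        cks,
        "FheUint64",           // stringify!($fhe_type)
        FheUint64::num_bits(), // bit width of the type
        "add",                 // $display_name
        "add",                 // stringify!($op)
        |lhs, rhs| lhs.add(rhs),
    );
}
```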
bench_type!(FheUint2);
bench_type!(FheUint4);
bench_type!(FheUint6);
bench_type!(FheUint8);
bench_type!(FheUint10);
bench_type!(FheUint12);
bench_type!(FheUint14);
bench_type!(FheUint16);
bench_type!(FheUint32);
bench_type!(FheUint64);
bench_type!(FheUint128);
// Generate benches for all FheUint types
generate_typed_benches!(FheUint2);
generate_typed_benches!(FheUint4);
generate_typed_benches!(FheUint6);
generate_typed_benches!(FheUint8);
generate_typed_benches!(FheUint10);
generate_typed_benches!(FheUint12);
generate_typed_benches!(FheUint14);
generate_typed_benches!(FheUint16);
generate_typed_benches!(FheUint32);
generate_typed_benches!(FheUint64);
generate_typed_benches!(FheUint128);
macro_rules! run_benches {
($c:expr, $cks:expr, $($fhe_type:ident),+ $(,)?) => {
$(
::paste::paste! {
[<bench_ $fhe_type:snake _add>]($c, $cks);
[<bench_ $fhe_type:snake _bitand>]($c, $cks);
[<bench_ $fhe_type:snake _bitor>]($c, $cks);
[<bench_ $fhe_type:snake _bitxor>]($c, $cks);
[<bench_ $fhe_type:snake _div>]($c, $cks);
[<bench_ $fhe_type:snake _div_rem>]($c, $cks);
[<bench_ $fhe_type:snake _eq>]($c, $cks);
[<bench_ $fhe_type:snake _flip>]($c, $cks);
[<bench_ $fhe_type:snake _ge>]($c, $cks);
[<bench_ $fhe_type:snake _gt>]($c, $cks);
[<bench_ $fhe_type:snake _if_then_else>]($c, $cks);
[<bench_ $fhe_type:snake _le>]($c, $cks);
[<bench_ $fhe_type:snake _leading_ones>]($c, $cks);
[<bench_ $fhe_type:snake _leading_zeros>]($c, $cks);
[<bench_ $fhe_type:snake _lt>]($c, $cks);
[<bench_ $fhe_type:snake _max>]($c, $cks);
[<bench_ $fhe_type:snake _min>]($c, $cks);
[<bench_ $fhe_type:snake _mul>]($c, $cks);
[<bench_ $fhe_type:snake _ne>]($c, $cks);
[<bench_ $fhe_type:snake _neg>]($c, $cks);
[<bench_ $fhe_type:snake _not>]($c, $cks);
[<bench_ $fhe_type:snake _overflowing_add>]($c, $cks);
[<bench_ $fhe_type:snake _overflowing_sub>]($c, $cks);
[<bench_ $fhe_type:snake _rem>]($c, $cks);
[<bench_ $fhe_type:snake _rotate_left>]($c, $cks);
[<bench_ $fhe_type:snake _rotate_right>]($c, $cks);
[<bench_ $fhe_type:snake _shl>]($c, $cks);
[<bench_ $fhe_type:snake _shr>]($c, $cks);
[<bench_ $fhe_type:snake _sub>]($c, $cks);
[<bench_ $fhe_type:snake _trailing_ones>]($c, $cks);
[<bench_ $fhe_type:snake _trailing_zeros>]($c, $cks);
}
)+
};
}
trait TypeDisplay {
fn fmt(f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -444,7 +567,7 @@ fn main() {
match env_config.bit_sizes_set {
BitSizesSet::Fast => {
bench_fhe_uint64(&mut c, &cks);
run_benches!(&mut c, &cks, FheUint64);
// KVStore Benches
if benched_device == tfhe::Device::Cpu {
@@ -452,17 +575,11 @@ fn main() {
}
}
_ => {
bench_fhe_uint2(&mut c, &cks);
bench_fhe_uint4(&mut c, &cks);
bench_fhe_uint6(&mut c, &cks);
bench_fhe_uint8(&mut c, &cks);
bench_fhe_uint10(&mut c, &cks);
bench_fhe_uint12(&mut c, &cks);
bench_fhe_uint14(&mut c, &cks);
bench_fhe_uint16(&mut c, &cks);
bench_fhe_uint32(&mut c, &cks);
bench_fhe_uint64(&mut c, &cks);
bench_fhe_uint128(&mut c, &cks);
// Call all benchmarks for all types
run_benches!(
&mut c, &cks, FheUint2, FheUint4, FheUint6, FheUint8, FheUint10, FheUint12,
FheUint14, FheUint16, FheUint32, FheUint64, FheUint128
);
// KVStore Benches
if benched_device == tfhe::Device::Cpu {
@@ -481,5 +598,8 @@ fn main() {
}
}
#[cfg(not(feature = "hpu"))]
oprf_any_range2();
c.final_summary();
}

View File

@@ -0,0 +1,44 @@
use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use criterion::{black_box, criterion_group, Criterion};
use std::num::NonZeroU64;
use tfhe::{set_server_key, ClientKey, ConfigBuilder, FheUint64, RangeForRandom, Seed, ServerKey};
pub fn oprf_any_range(c: &mut Criterion) {
let bench_name = "hlapi::oprf_any_range";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
let param = BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let config = ConfigBuilder::with_custom_parameters(param).build();
let cks = ClientKey::generate(config);
let sks = ServerKey::new(&cks);
rayon::broadcast(|_| set_server_key(sks.clone()));
set_server_key(sks);
for excluded_upper_bound in [3, 52] {
let range = RangeForRandom::new_from_excluded_upper_bound(
NonZeroU64::new(excluded_upper_bound).unwrap(),
);
let bench_id_oprf = format!("{bench_name}::bound_{excluded_upper_bound}");
bench_group.bench_function(&bench_id_oprf, |b| {
b.iter(|| {
_ = black_box(FheUint64::generate_oblivious_pseudo_random_custom_range(
Seed(0),
&range,
None,
));
})
});
}
bench_group.finish()
}
criterion_group!(oprf_any_range2, oprf_any_range);
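One detail worth flagging in this bench: `set_server_key` stores the key in thread-local storage, so the `rayon::broadcast` call is what makes the key visible to every worker thread of the global rayon pool, and the final `set_server_key(sks)` covers the calling thread itself. A minimal sketch of the pattern in isolation (key generation included only for self-containment):

```rust
use tfhe::{set_server_key, ClientKey, ConfigBuilder, ServerKey};

fn install_server_key_on_all_threads() {
    let config = ConfigBuilder::default().build();
    let cks = ClientKey::generate(config);
    let sks = ServerKey::new(&cks);
    // Install a clone on every thread of the global rayon pool...
    rayon::broadcast(|_| set_server_key(sks.clone()));
    // ...and on the calling thread as well.
    set_server_key(sks);
}
```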

View File

@@ -2809,6 +2809,7 @@ mod cuda {
criterion_group!(
default_cuda_dedup_ops,
cuda_add,
cuda_neg,
cuda_mul,
cuda_div_rem,
cuda_bitand,

View File

@@ -629,7 +629,9 @@ mod integer_params {
// operations.
#[cfg(feature = "hpu")]
let params = vec![BENCH_HPU_PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M128.into()];
#[cfg(not(feature = "hpu"))]
#[cfg(feature = "gpu")]
let params = vec![BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS.into()];
#[cfg(not(any(feature = "gpu", feature = "hpu")))]
let params = vec![BENCH_PARAM_MESSAGE_2_CARRY_2_KS32_PBS.into()];
let params_and_bit_sizes = iproduct!(params, env_config.bit_sizes());

View File

@@ -27,6 +27,7 @@ rand_distr = "0.4.3"
criterion = "0.5.1"
doc-comment = "0.3.3"
serde_json = "1.0.94"
num-bigint = "0.4.6"
# clap has to be pinned as its minimum supported rust version
# changes often between minor releases, which breaks our CI
clap = { version = "=4.5.30", features = ["derive"] }

View File

@@ -2,14 +2,30 @@
This document explains the mechanism and steps to generate an oblivious encrypted random value using only server keys.
The goal is to give to the server the possibility to generate a random value, which will be obtained in an encrypted format and will remain unknown to the server. The implementation is based on [this article](https://eprint.iacr.org/2024/665).
The goal is to give the server the ability to generate a random value, which will be obtained in an encrypted format and will remain unknown to the server.
This is possible through two methods on `FheUint` and `FheInt`:
The main method for this is `FheUint::generate_oblivious_pseudo_random_custom_range`, which returns an integer in the given range.
Currently the range can only be in the form `[0, excluded_upper_bound[` with any `excluded_upper_bound` in `[1, 2^64[`.
It follows a distribution close to uniform.
This function guarantees that the norm-1 distance (defined as ∆(P,Q) := 1/2 Sum[ω∈Ω] |P(ω) − Q(ω)|)
between the actual distribution and the target uniform distribution will be below the `max_distance` argument (which must be in ]0, 1[).
The higher the distance, the more dissimilar the actual distribution is from the target uniform distribution.
The default value for `max_distance` is `2^-128` if `None` is provided.
Higher values allow better performance but must be considered carefully in the context of the target application, as they may have serious unintended consequences.
If the range is a power of 2, the distribution is uniform (for any `max_distance`) and the cost is smaller.
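For intuition, a small worked case consistent with the `distance` helper and the clear model `(random_input * excluded_upper_bound) >> num_input_random_bits` added later in this PR: with `excluded_upper_bound = 3` and 2 input random bits, the map x ↦ (3x) >> 2 sends {0, 1, 2, 3} to {0, 0, 1, 2}, so the output probabilities are (1/2, 1/4, 1/4) against a uniform target of (1/3, 1/3, 1/3). The norm-1 distance is 1/2 (|1/2 − 1/3| + |1/4 − 1/3| + |1/4 − 1/3|) = 1/6, matching the closed form remainder · (excluded_upper_bound − remainder) / (2^bits · excluded_upper_bound) = 1 · 2 / (4 · 3) with remainder = 2^2 mod 3 = 1.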
For powers of 2 specifically, there are two methods on `FheUint` and `FheInt` (based on [this article](https://eprint.iacr.org/2024/665)):
- `generate_oblivious_pseudo_random` which returns an integer taken uniformly in the full integer range (`[0; 2^N[` for a `FheUintN` and `[-2^(N-1); 2^(N-1)[` for a `FheIntN`).
- `generate_oblivious_pseudo_random_bounded` which returns an integer taken uniformly in `[0; 2^random_bits_count[`. For a `FheUintN`, we must have `random_bits_count <= N`. For a `FheIntN`, we must have `random_bits_count <= N - 1`.
Both methods take a seed `Seed` as input, which could be any `u128` value.
They both rely on the use of the usual server key.
These methods take a seed `Seed` as input, which could be any `u128` value.
They rely on the use of the usual server key.
The output is reproducible, i.e., the function is deterministic from the inputs: assuming the same hardware, seed and server key, this function outputs the same random encrypted value.
@@ -18,7 +34,8 @@ Here is an example of the usage:
```rust
use tfhe::prelude::FheDecrypt;
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint8, FheInt8, Seed};
use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint8, FheInt8, RangeForRandom, Seed};
use std::num::NonZeroU64;
pub fn main() {
let config = ConfigBuilder::default().build();
@@ -26,23 +43,30 @@ pub fn main() {
set_server_key(server_key);
let random_bits_count = 3;
let ct_res = FheUint8::generate_oblivious_pseudo_random(Seed(0));
let excluded_upper_bound = NonZeroU64::new(3).unwrap();
let range = RangeForRandom::new_from_excluded_upper_bound(excluded_upper_bound);
// in [0, excluded_upper_bound[ = {0, 1, 2}
let ct_res = FheUint8::generate_oblivious_pseudo_random_custom_range(Seed(0), &range, None);
let dec_result: u8 = ct_res.decrypt(&client_key);
let ct_res = FheUint8::generate_oblivious_pseudo_random_bounded(Seed(0), random_bits_count);
let random_bits_count = 3;
// in [0, 2^8[
let ct_res = FheUint8::generate_oblivious_pseudo_random(Seed(0));
let dec_result: u8 = ct_res.decrypt(&client_key);
// in [0, 2^random_bits_count[ = [0, 8[
let ct_res = FheUint8::generate_oblivious_pseudo_random_bounded(Seed(0), random_bits_count);
let dec_result: u8 = ct_res.decrypt(&client_key);
assert!(dec_result < (1 << random_bits_count));
// in [-2^7, 2^7[
let ct_res = FheInt8::generate_oblivious_pseudo_random(Seed(0));
let dec_result: i8 = ct_res.decrypt(&client_key);
// in [0, 2^random_bits_count[ = [0, 8[
let ct_res = FheInt8::generate_oblivious_pseudo_random_bounded(Seed(0), random_bits_count);
let dec_result: i8 = ct_res.decrypt(&client_key);
assert!(dec_result < (1 << random_bits_count));
}

View File

@@ -141,7 +141,7 @@ Some parameter sets lead to the FHE keys exceeding the 2GB memory limit of WASM,
### Setting up TFHE-rs JS on WASM API for Node.js programs.
To build the JS on WASM bindings for **TFHE-rs**, install [`wasm-pack`](https://rustwasm.github.io/wasm-pack/) and the necessary [`rust toolchain`](https://rustup.rs/). Clone the **TFHE-rs** repository and build using the following commands (this will build using the default branch, you can check out a specific tag depending on your requirements):
To build the JS on WASM bindings for **TFHE-rs**, install [`wasm-pack`](https://drager.github.io/wasm-pack/) and the necessary [`rust toolchain`](https://rustup.rs/). Clone the **TFHE-rs** repository and build using the following commands (this will build using the default branch, you can check out a specific tag depending on your requirements):
```shell
$ git clone https://github.com/zama-ai/tfhe-rs.git
@@ -150,7 +150,7 @@ Cloning into 'tfhe-rs'...
Resolving deltas: 100% (3866/3866), done.
$ cd tfhe-rs
$ cd tfhe
$ rustup run wasm-pack build --release --target=nodejs --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api
$ wasm-pack build --release --target=nodejs --features=boolean-client-js-wasm-api,shortint-client-js-wasm-api
[INFO]: Compiling to Wasm...
...
[INFO]: :-) Your wasm pkg is ready to publish at ...
@@ -164,7 +164,7 @@ After the build, a new directory **pkg** is available in the `tfhe` directory.
```shell
$ ls pkg
LICENSE index.html package.json tfhe.d.ts tfhe.js tfhe_bg.txt tfhe_bg.wasm tfhe_bg.wasm.d.ts
LICENSE README.md package.json tfhe.d.ts tfhe.js tfhe_bg.wasm tfhe_bg.wasm.d.ts
$
```

View File

@@ -540,10 +540,12 @@ pub fn sup_diff(cumulative_bins: &[u64], theoretical_cdf: &[f64]) -> f64 {
.iter()
.copied()
.zip_eq(theoretical_cdf.iter().copied())
.map(|(x, theoretical_cdf)| {
.enumerate()
.map(|(i, (x, theoretical_cdf))| {
let empirical_cdf = x as f64 / number_of_samples as f64;
if theoretical_cdf == 1.0 {
if i == cumulative_bins.len() - 1 {
assert_eq!(theoretical_cdf, 1.0);
assert_eq!(empirical_cdf, 1.0);
}

View File

@@ -19,6 +19,7 @@ use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::integer::prelude::*;
use crate::integer::BooleanBlock;
use crate::named::Named;
use crate::prelude::FheWait;
use crate::shortint::ciphertext::NotTrivialCiphertextError;
use crate::shortint::parameters::CiphertextConformanceParams;
use crate::shortint::AtomicPatternParameters;
@@ -73,6 +74,12 @@ impl Named for FheBool {
const NAME: &'static str = "high_level_api::FheBool";
}
impl FheWait for FheBool {
fn wait(&self) {
self.ciphertext.wait()
}
}
#[derive(Copy, Clone)]
pub struct FheBoolConformanceParams(pub(crate) CiphertextConformanceParams);

View File

@@ -139,6 +139,16 @@ impl InnerBoolean {
}
}
pub(crate) fn wait(&self) {
match self {
Self::Cpu(_) => {}
#[cfg(feature = "gpu")]
Self::Cuda(_) => {}
#[cfg(feature = "hpu")]
Self::Hpu(ct) => ct.wait(),
}
}
/// Returns the inner cpu ciphertext if self is on the CPU, otherwise, returns a copy
/// that is on the CPU
pub(crate) fn on_cpu(&self) -> MaybeCloned<'_, BooleanBlock> {

View File

@@ -4,7 +4,9 @@ use crate::high_level_api::keys::InternalServerKey;
use crate::high_level_api::re_randomization::ReRandomizationMetadata;
#[cfg(feature = "gpu")]
use crate::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext};
use crate::shortint::MessageModulus;
use crate::{FheInt, Seed};
use std::num::NonZeroU64;
impl<Id: FheUintId> FheUint<Id> {
/// Generates an encrypted unsigned integer
@@ -92,7 +94,7 @@ impl<Id: FheUintId> FheUint<Id> {
}
})
}
/// Generates an encrypted `num_block` blocks unsigned integer
/// Generates an encrypted unsigned integer
/// taken uniformly in `[0, 2^random_bits_count[` using the given seed.
/// The encrypted value is oblivious to the server.
/// It can be useful to make server random generation deterministic.
@@ -150,6 +152,103 @@ impl<Id: FheUintId> FheUint<Id> {
}
})
}
/// Generates an encrypted unsigned integer
/// taken almost uniformly in the given range using the given seed.
/// Currently the range can only be in the form `[0, excluded_upper_bound[`
/// with any `excluded_upper_bound` in `[1, 2^64[`.
///
/// The encrypted value is oblivious to the server.
/// It can be useful to make server random generation deterministic.
///
/// This function guarantees that the norm-1 distance
/// (defined as ∆(P,Q) := 1/2 Sum[ω∈Ω] |P(ω) − Q(ω)|)
/// between the actual distribution and the target uniform distribution
/// will be below the `max_distance` argument (which must be in ]0, 1[).
/// The higher the distance, the more dissimilar the actual distribution is
/// from the target uniform distribution.
///
/// The default value for `max_distance` is `2^-128` if `None` is provided.
///
/// Higher values allow better performance but must be considered carefully in the context of
/// the target application, as they may have serious unintended consequences.
///
/// If the range is a power of 2, the distribution is uniform (for any `max_distance`) and
/// the cost is smaller.
///
/// ```rust
/// use std::num::NonZeroU64;
/// use tfhe::prelude::FheDecrypt;
/// use tfhe::{generate_keys, set_server_key, ConfigBuilder, FheUint8, RangeForRandom, Seed};
///
/// let config = ConfigBuilder::default().build();
/// let (client_key, server_key) = generate_keys(config);
///
/// set_server_key(server_key);
///
/// let excluded_upper_bound = NonZeroU64::new(3).unwrap();
///
/// let range = RangeForRandom::new_from_excluded_upper_bound(excluded_upper_bound);
///
/// let ct_res = FheUint8::generate_oblivious_pseudo_random_custom_range(Seed(0), &range, None);
///
/// let dec_result: u16 = ct_res.decrypt(&client_key);
/// assert!(dec_result < excluded_upper_bound.get() as u16);
/// ```
pub fn generate_oblivious_pseudo_random_custom_range(
seed: Seed,
range: &RangeForRandom,
max_distance: Option<f64>,
) -> Self {
let excluded_upper_bound = range.excluded_upper_bound;
if excluded_upper_bound.is_power_of_two() {
let random_bits_count = excluded_upper_bound.ilog2() as u64;
Self::generate_oblivious_pseudo_random_bounded(seed, random_bits_count)
} else {
let max_distance = max_distance.unwrap_or_else(|| 2_f64.powi(-128));
assert!(
0_f64 < max_distance && max_distance < 1_f64,
"max_distance (={max_distance}) should be in ]0, 1["
);
global_state::with_internal_keys(|key| match key {
InternalServerKey::Cpu(key) => {
let message_modulus = key.message_modulus();
let num_input_random_bits = num_input_random_bits_for_max_distance(
excluded_upper_bound,
max_distance,
message_modulus,
);
let num_blocks_output = Id::num_blocks(key.message_modulus()) as u64;
let ct = key
.pbs_key()
.par_generate_oblivious_pseudo_random_unsigned_custom_range(
seed,
num_input_random_bits,
excluded_upper_bound,
num_blocks_output,
);
Self::new(ct, key.tag.clone(), ReRandomizationMetadata::default())
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_cuda_key) => {
panic!("Gpu does not support this operation yet.")
}
#[cfg(feature = "hpu")]
InternalServerKey::Hpu(_device) => {
panic!("Hpu does not support this operation yet.")
}
})
}
}
#[cfg(feature = "gpu")]
/// Returns the amount of memory required to execute generate_oblivious_pseudo_random_bounded
///
@@ -273,7 +372,7 @@ impl<Id: FheIntId> FheInt<Id> {
}
})
}
/// Generates an encrypted `num_block` blocks signed integer
/// Generates an encrypted signed integer
/// taken uniformly in `[0, 2^random_bits_count[` using the given seed.
/// The encrypted value is oblivious to the server.
/// It can be useful to make server random generation deterministic.
@@ -367,10 +466,350 @@ impl<Id: FheIntId> FheInt<Id> {
}
}
pub struct RangeForRandom {
excluded_upper_bound: NonZeroU64,
}
impl RangeForRandom {
pub fn new_from_excluded_upper_bound(excluded_upper_bound: NonZeroU64) -> Self {
Self {
excluded_upper_bound,
}
}
}
fn num_input_random_bits_for_max_distance(
excluded_upper_bound: NonZeroU64,
max_distance: f64,
message_modulus: MessageModulus,
) -> u64 {
assert!(message_modulus.0.is_power_of_two());
let log_message_modulus = message_modulus.0.ilog2() as u64;
let mut random_block_count = 1;
let random_block_count = loop {
let random_bit_count = random_block_count * log_message_modulus;
let distance = distance(excluded_upper_bound.get(), random_bit_count);
if distance < max_distance {
break random_block_count;
}
random_block_count += 1;
};
random_block_count * log_message_modulus
}
fn distance(excluded_upper_bound: u64, random_bit_count: u64) -> f64 {
let remainder = mod_pow_2(random_bit_count, excluded_upper_bound);
remainder as f64 * (excluded_upper_bound - remainder) as f64
/ (2_f64.powi(random_bit_count as i32) * excluded_upper_bound as f64)
}
// Computes 2^exponent % modulus
fn mod_pow_2(exponent: u64, modulus: u64) -> u64 {
assert_ne!(modulus, 0);
if modulus == 1 {
return 0;
}
let mut result: u128 = 1;
let mut base: u128 = 2; // We are computing powers of 2 mod `modulus`
// u128 intermediates keep `result * base` and `base * base` from overflowing,
// since `modulus` can be as large as u64::MAX
let mut exp = exponent;
let mod_val = modulus as u128;
while exp > 0 {
// If exponent is odd, multiply result with base
if exp % 2 == 1 {
result = (result * base) % mod_val;
}
// Square the base
base = (base * base) % mod_val;
// Divide exponent by 2
exp /= 2;
}
result as u64
}
#[cfg(test)]
mod test {
use super::*;
use crate::integer::server_key::radix_parallel::tests_unsigned::test_oprf::{
oprf_density_function, p_value_upper_bound_oprf_almost_uniformity_from_values,
probability_density_function_from_density,
};
use crate::prelude::FheDecrypt;
use crate::shortint::oprf::test::test_uniformity;
use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M128;
use crate::{generate_keys, set_server_key, ClientKey, ConfigBuilder, FheUint8, Seed};
use num_bigint::BigUint;
use rand::{thread_rng, Rng};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
// Helper: The "Oracle" implementation using BigInt
// This is slow but mathematically guaranteed to be correct.
fn oracle_mod_pow_2(exponent: u64, modulus: u64) -> u64 {
assert_ne!(modulus, 0);
if modulus == 1 {
return 0;
}
let base = BigUint::from(2u32);
let exp = BigUint::from(exponent);
let modu = BigUint::from(modulus);
let res = base.modpow(&exp, &modu);
res.iter_u64_digits().next().unwrap_or(0)
}
#[test]
fn test_edge_cases() {
// 2^0 % 10 = 1
assert_eq!(mod_pow_2(0, 10), 1, "Failed exponent 0");
// 2^10 % 1 = 0
assert_eq!(mod_pow_2(10, 1), 0, "Failed modulus 1");
// 2^1 % 10 = 2
assert_eq!(mod_pow_2(1, 10), 2, "Failed exponent 1");
// 2^3 % 5 = 8 % 5 = 3
assert_eq!(mod_pow_2(3, 5), 3, "Failed small calc");
}
#[test]
fn test_boundaries_and_overflow() {
assert_eq!(mod_pow_2(2, u64::MAX), 4);
assert_eq!(mod_pow_2(u64::MAX, 3), 2);
assert_eq!(mod_pow_2(5, 32), 0);
}
#[test]
fn test_against_oracle() {
let mut rng = thread_rng();
for _ in 0..1_000_000 {
let exp: u64 = rng.gen();
let mod_val: u64 = rng.gen();
let mod_val = if mod_val == 0 { 1 } else { mod_val };
let expected = oracle_mod_pow_2(exp, mod_val);
let actual = mod_pow_2(exp, mod_val);
assert_eq!(
actual, expected,
"Mismatch! 2^{exp} % {mod_val} => Ours: {actual}, Oracle: {expected}",
);
}
}
#[test]
fn test_distance_with_uniform() {
for excluded_upper_bound in 1..20 {
for num_input_random_bits in 0..20 {
let density = oprf_density_function(excluded_upper_bound, num_input_random_bits);
let theoretical_pdf = probability_density_function_from_density(&density);
let p_uniform = 1. / excluded_upper_bound as f64;
let actual_distance: f64 = 1. / 2.
* theoretical_pdf
.iter()
.map(|p| (*p - p_uniform).abs())
.sum::<f64>();
let theoretical_distance = distance(excluded_upper_bound, num_input_random_bits);
assert!(
(theoretical_distance - actual_distance).abs()
<= theoretical_distance / 1_000_000.,
"{theoretical_distance} != {actual_distance}"
);
}
}
}
#[test]
fn test_uniformity_scalar_mul_shift() {
let max_distance = 2_f64.powi(-20);
let message_modulus = MessageModulus(4);
let excluded_upper_bound = 3;
let num_input_random_bits = num_input_random_bits_for_max_distance(
NonZeroU64::new(excluded_upper_bound).unwrap(),
max_distance,
message_modulus,
);
let sample_count: usize = 10_000_000;
let p_value_limit: f64 = 0.001;
// The distribution is not exactly uniform.
// This check ensures that, with the given low max_distance,
// the distribution is indistinguishable from uniform at the given sample count
test_uniformity(sample_count, p_value_limit, excluded_upper_bound, |_seed| {
oprf_clear_equivalent(excluded_upper_bound, num_input_random_bits)
});
}
fn oprf_clear_equivalent(excluded_upper_bound: u64, num_input_random_bits: u64) -> u64 {
let random_input_upper_bound = 1 << num_input_random_bits;
let random_input = thread_rng().gen_range(0..random_input_upper_bound);
(random_input * excluded_upper_bound) >> num_input_random_bits
}
#[test]
fn test_uniformity_generate_oblivious_pseudo_random_custom_range() {
let base_sample_count: usize = 10_000;
let p_value_limit: f64 = 0.001;
let params = PARAM_MESSAGE_2_CARRY_2_KS32_PBS_TUNIFORM_2M128;
let config = ConfigBuilder::with_custom_parameters(params).build();
let (cks, sks) = generate_keys(config);
rayon::broadcast(|_| set_server_key(sks.clone()));
let message_modulus = params.message_modulus;
// [0.7, 0.1] for `max_distance` chosen to have `num_input_random_bits` be [2, 4]
// for any of the listed `excluded_upper_bound`
for (expected_num_input_random_bits, max_distance, excluded_upper_bounds) in
[(2, 0.7, [3, 5, 6, 7]), (4, 0.1, [3, 5, 6, 7])]
{
for excluded_upper_bound in excluded_upper_bounds {
let sample_count = base_sample_count * excluded_upper_bound as usize;
let excluded_upper_bound = NonZeroU64::new(excluded_upper_bound).unwrap();
let num_input_random_bits = num_input_random_bits_for_max_distance(
excluded_upper_bound,
max_distance,
message_modulus,
);
assert_eq!(num_input_random_bits, expected_num_input_random_bits);
test_uniformity_generate_oblivious_pseudo_random_custom_range2(
sample_count,
p_value_limit,
message_modulus,
&cks,
excluded_upper_bound,
max_distance,
);
}
}
}
fn test_uniformity_generate_oblivious_pseudo_random_custom_range2(
sample_count: usize,
p_value_limit: f64,
message_modulus: MessageModulus,
cks: &ClientKey,
excluded_upper_bound: NonZeroU64,
max_distance: f64,
) {
let num_input_random_bits = num_input_random_bits_for_max_distance(
excluded_upper_bound,
max_distance,
message_modulus,
);
let range = RangeForRandom::new_from_excluded_upper_bound(excluded_upper_bound);
let real_values: Vec<u64> = (0..sample_count)
.into_par_iter()
.map(|_| {
let img = FheUint8::generate_oblivious_pseudo_random_custom_range(
Seed(rand::thread_rng().gen::<u128>()),
&range,
Some(max_distance),
);
img.decrypt(cks)
})
.collect();
let excluded_upper_bound = excluded_upper_bound.get();
let uniform_values: Vec<u64> = (0..sample_count)
.into_par_iter()
.map(|_| thread_rng().gen_range(0..excluded_upper_bound))
.collect();
let clear_oprf_value_lower_num_input_random_bits = (0..sample_count)
.into_par_iter()
.map(|_| oprf_clear_equivalent(excluded_upper_bound, num_input_random_bits - 1))
.collect();
let clear_oprf_value_same_num_input_random_bits = (0..sample_count)
.into_par_iter()
.map(|_| oprf_clear_equivalent(excluded_upper_bound, num_input_random_bits))
.collect();
let clear_oprf_value_higher_num_input_random_bits = (0..sample_count)
.into_par_iter()
.map(|_| oprf_clear_equivalent(excluded_upper_bound, num_input_random_bits + 1))
.collect();
for (values, should_have_low_p_value) in [
(&real_values, false),
// to test that the same distribution passes
(&clear_oprf_value_same_num_input_random_bits, false),
// to test that other distributions don't pass
// (makes sure the test is statistically powerful)
(&uniform_values, true),
(&clear_oprf_value_lower_num_input_random_bits, true),
(&clear_oprf_value_higher_num_input_random_bits, true),
] {
let p_value_upper_bound = p_value_upper_bound_oprf_almost_uniformity_from_values(
values,
num_input_random_bits,
excluded_upper_bound,
);
println!("p_value_upper_bound: {p_value_upper_bound}");
if should_have_low_p_value {
assert!(
p_value_upper_bound < p_value_limit,
"p_value_upper_bound (={p_value_upper_bound}) expected to be smaller than {p_value_limit}"
);
} else {
assert!(
p_value_limit < p_value_upper_bound,
"p_value_upper_bound (={p_value_upper_bound}) expected to be bigger than {p_value_limit}"
);
}
}
}
}
#[cfg(test)]
#[cfg(feature = "gpu")]
#[allow(unused_imports)]
mod test {
mod test_gpu {
use crate::prelude::*;
use crate::{
generate_keys, set_server_key, ConfigBuilder, FheInt128, FheUint32, FheUint64, GpuIndex,

View File

@@ -48,6 +48,7 @@ macro_rules! export_concrete_array_types {
}
pub use crate::core_crypto::commons::math::random::{Seed, XofSeed};
pub use crate::high_level_api::integers::oprf::RangeForRandom;
pub use crate::integer::server_key::MatchValues;
use crate::{error, Error, Versionize};
use backward_compatibility::compressed_ciphertext_list::SquashedNoiseCiphertextStateVersions;

View File

@@ -1,4 +1,5 @@
use crate::core_crypto::gpu::entities::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey;
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::gpu::CudaStreams;
@@ -16,7 +17,8 @@ use crate::integer::gpu::ciphertext::CudaRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
cuda_backend_compress, cuda_backend_decompress, cuda_backend_get_compression_size_on_gpu,
cuda_backend_get_decompression_size_on_gpu, cuda_memcpy_async_gpu_to_gpu, PBSType,
cuda_backend_get_decompression_size_on_gpu, cuda_memcpy_async_gpu_to_gpu, extract_glwe_async,
PBSType,
};
use crate::prelude::CastInto;
use crate::shortint::ciphertext::{
@@ -197,6 +199,30 @@ impl<T: UnsignedInteger> CudaPackedGlweCiphertextList<T> {
meta: self.meta,
}
}
pub fn extract_glwe(
&self,
glwe_index: usize,
streams: &CudaStreams,
) -> CudaGlweCiphertextList<T> {
let meta = self
.meta
.as_ref()
.expect("CudaPackedGlweCiphertextList meta must be set to extract GLWE");
let mut output_cuda_glwe_list = CudaGlweCiphertextList::new(
meta.glwe_dimension,
meta.polynomial_size,
GlweCiphertextCount(1),
meta.ciphertext_modulus,
streams,
);
unsafe {
extract_glwe_async(streams, &mut output_cuda_glwe_list, self, glwe_index as u32);
}
streams.synchronize();
output_cuda_glwe_list
}
}
impl<T: UnsignedInteger> Clone for CudaPackedGlweCiphertextList<T> {

View File

@@ -7,6 +7,7 @@ pub mod server_key;
#[cfg(feature = "zk-pok")]
pub mod zk;
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_bootstrap_key::CudaModulusSwitchNoiseReductionConfiguration;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::lwe_compact_ciphertext_list::CudaLweCompactCiphertextList;
@@ -10423,3 +10424,44 @@ pub unsafe fn unchecked_small_scalar_mul_integer_async(
carry_modulus.0 as u32,
);
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
pub unsafe fn extract_glwe_async<T: UnsignedInteger>(
streams: &CudaStreams,
glwe_array_out: &mut CudaGlweCiphertextList<T>,
glwe_list: &CudaPackedGlweCiphertextList<T>,
glwe_index: u32,
) {
assert_eq!(
streams.gpu_indexes[0],
glwe_array_out.0.d_vec.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
assert_eq!(
streams.gpu_indexes[0],
glwe_list.data.gpu_index(0),
"GPU error: all data should reside on the same GPU."
);
let packed_glwe_list_ffi = prepare_cuda_packed_glwe_ct_ffi(glwe_list);
if T::BITS == 128 {
cuda_integer_extract_glwe_128(
streams.ffi(),
glwe_array_out.0.d_vec.as_mut_c_ptr(0),
&raw const packed_glwe_list_ffi,
glwe_index,
);
} else if T::BITS == 64 {
cuda_integer_extract_glwe_64(
streams.ffi(),
glwe_array_out.0.d_vec.as_mut_c_ptr(0),
&raw const packed_glwe_list_ffi,
glwe_index,
);
} else {
panic!("Unsupported integer size for CUDA GLWE extraction");
}
}

View File

@@ -0,0 +1,757 @@
use super::utils::noise_simulation::{CudaDynLwe, CudaSideResources};
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::{GlweCiphertext, LweCiphertext};
use crate::integer::compression_keys::CompressionPrivateKeys;
use crate::integer::gpu::list_compression::server_keys::CudaCompressionKey;
use crate::integer::gpu::server_key::radix::tests_noise_distribution::utils::noise_simulation::cuda_glwe_list_to_glwe_ciphertext;
use crate::integer::gpu::server_key::radix::tests_unsigned::create_gpu_parameterized_test;
use crate::integer::gpu::server_key::radix::CudaUnsignedRadixCiphertext;
use crate::integer::gpu::CudaServerKey;
use crate::integer::{ClientKey, CompressedServerKey, IntegerCiphertext};
use crate::shortint::ciphertext::{Ciphertext, Degree, NoiseLevel};
use crate::shortint::client_key::atomic_pattern::AtomicPatternClientKey;
use crate::shortint::engine::ShortintEngine;
use crate::shortint::parameters::test_params::TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128;
use crate::shortint::parameters::{CompressionParameters, MetaParameters, Variance};
use crate::shortint::server_key::tests::noise_distribution::br_dp_packingks_ms::br_dp_packing_ks_ms;
use crate::shortint::server_key::tests::noise_distribution::utils::noise_simulation::{
NoiseSimulationGlwe, NoiseSimulationLwe, NoiseSimulationLweFourierBsk,
NoiseSimulationLwePackingKeyswitchKey, NoiseSimulationModulus,
};
use crate::shortint::server_key::tests::noise_distribution::utils::{
expected_pfail_for_precision, mean_and_variance_check, normality_check, pfail_check,
precision_with_padding, update_ap_params_msg_and_carry_moduli, DecryptionAndNoiseResult,
NoiseSample, PfailAndPrecision, PfailTestMeta, PfailTestResult,
};
use crate::shortint::server_key::tests::noise_distribution::{
should_run_short_pfail_tests_debug, should_use_single_key_debug,
};
use crate::shortint::{
AtomicPatternParameters, CarryModulus, MessageModulus, ShortintEncoding, ShortintParameterSet,
};
use crate::GpuIndex;
use rayon::iter::{IntoParallelIterator, ParallelIterator};
pub const SAMPLES_PER_MSG_PACKING_KS_NOISE: usize = 1000;
fn sanity_check_encrypt_br_dp_packing_ks_ms(meta_params: MetaParameters) {
let (params, comp_params) = (
meta_params.compute_parameters,
meta_params.compression_parameters.unwrap(),
);
let gpu_index = 0;
let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let block_params: ShortintParameterSet = params.into();
let cks = crate::integer::ClientKey::new(block_params);
let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
let private_compression_key = cks.new_compression_private_key(comp_params);
let (compressed_compression_key, _compressed_decompression_key) =
cks.new_compressed_compression_decompression_keys(&private_compression_key);
let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&streams);
let lwe_per_glwe = cuda_compression_key.lwe_per_glwe;
// The multiplication done in the compression moves the message up to the top of the
// carry space; multiplying by the carry modulus achieves that
let dp_scalar = params.carry_modulus().0;
let br_input_modulus_log = cuda_sks.br_input_modulus_log();
let storage_modulus_log = cuda_compression_key.storage_log_modulus;
let id_lut = cuda_sks.generate_lookup_table(|x| x);
let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut.acc, &streams);
let input_zeros: Vec<_> = (0..lwe_per_glwe.0)
.map(|_| {
cks.key
.encrypt_noiseless_pbs_input_dyn_lwe(br_input_modulus_log, 0)
})
.collect();
let d_input_zeros: Vec<_> = input_zeros
.iter()
.map(|ct| {
let d_ct_input = CudaLweCiphertextList::from_lwe_ciphertext(&ct.as_lwe_64(), &streams);
CudaDynLwe::U64(d_ct_input)
})
.collect();
let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
degree: crate::shortint::ciphertext::Degree::new(params.message_modulus().0 - 1),
message_modulus: params.message_modulus(),
carry_modulus: params.carry_modulus(),
atomic_pattern: params.atomic_pattern(),
noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
};
let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
.map(|_| CudaSideResources::new(&streams, cuda_block_info))
.collect();
let (d_before_packing, _after_packing, d_after_ms) = br_dp_packing_ks_ms(
d_input_zeros,
&cuda_sks,
&d_accumulator,
dp_scalar,
&cuda_compression_key.packing_key_switching_key,
storage_modulus_log,
&mut cuda_side_resources,
);
let compression_inputs: Vec<_> = d_before_packing
.into_iter()
.map(|(_input, pbs_result, _dp_result)| {
let pbs_result_list_cpu = pbs_result.as_lwe_64().to_lwe_ciphertext_list(&streams);
let pbs_result_cpu = LweCiphertext::from_container(
pbs_result_list_cpu.clone().into_container(),
pbs_result_list_cpu.ciphertext_modulus(),
);
let cpu_ct = Ciphertext::new(
pbs_result_cpu,
Degree::new(params.message_modulus().0 - 1),
NoiseLevel::NOMINAL,
params.message_modulus(),
params.carry_modulus(),
params.atomic_pattern(),
);
let radix_ct = crate::integer::RadixCiphertext::from_blocks(vec![cpu_ct]);
let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&radix_ct, &streams);
d_ct.ciphertext
})
.collect();
let gpu_compressed =
cuda_compression_key.compress_ciphertexts_into_list(&compression_inputs, &streams);
let gpu_extracted = gpu_compressed.extract_glwe(0, &streams);
let extracted_list = gpu_extracted.to_glwe_ciphertext_list(&streams);
let extracted_glwe = GlweCiphertext::from_container(
extracted_list.clone().into_container(),
extracted_list.polynomial_size(),
extracted_list.ciphertext_modulus(),
);
let after_ms_list = d_after_ms.to_glwe_ciphertext_list(&streams);
let mut after_ms = GlweCiphertext::from_container(
after_ms_list.clone().into_container(),
after_ms_list.polynomial_size(),
after_ms_list.ciphertext_modulus(),
);
// Bodies that were not filled are discarded
after_ms.get_mut_body().as_mut()[lwe_per_glwe.0..].fill(0);
assert_eq!(after_ms.as_view(), extracted_glwe.as_view());
}
create_gpu_parameterized_test!(sanity_check_encrypt_br_dp_packing_ks_ms {
TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
});
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
fn encrypt_br_dp_packing_ks_ms_inner_helper_gpu(
params: AtomicPatternParameters,
comp_params: CompressionParameters,
single_cks: &ClientKey,
single_cuda_sks: &CudaServerKey,
single_compression_private_key: &CompressionPrivateKeys,
single_cuda_compression_key: &CudaCompressionKey,
msg: u64,
streams: &CudaStreams,
) -> (
Vec<(
DecryptionAndNoiseResult,
DecryptionAndNoiseResult,
DecryptionAndNoiseResult,
)>,
Vec<DecryptionAndNoiseResult>,
Vec<DecryptionAndNoiseResult>,
) {
let mut engine = ShortintEngine::new();
let thread_cks: crate::integer::ClientKey;
let thread_cuda_sks: CudaServerKey;
let thread_compression_private_key;
let thread_cuda_compression_key;
let (cks, cuda_sks, compression_private_key, cuda_compression_key) =
if should_use_single_key_debug() {
(
single_cks,
single_cuda_sks,
single_compression_private_key,
single_cuda_compression_key,
)
} else {
let block_params: ShortintParameterSet = params.into();
thread_cks = crate::integer::ClientKey::new(block_params);
let compressed_server_key =
CompressedServerKey::new_radix_compressed_server_key(&thread_cks);
thread_cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, streams);
thread_compression_private_key = thread_cks.new_compression_private_key(comp_params);
let (compressed_compression_key, _compressed_decompression_key) = thread_cks
.new_compressed_compression_decompression_keys(&thread_compression_private_key);
thread_cuda_compression_key = compressed_compression_key.decompress_to_cuda(streams);
(
&thread_cks,
&thread_cuda_sks,
&thread_compression_private_key,
&thread_cuda_compression_key,
)
};
let br_input_modulus_log = cuda_sks.br_input_modulus_log();
let lwe_per_glwe = cuda_compression_key.lwe_per_glwe;
let input_zeros: Vec<_> = (0..lwe_per_glwe.0)
.map(|_| {
cks.key.encrypt_noiseless_pbs_input_dyn_lwe_with_engine(
br_input_modulus_log,
msg,
&mut engine,
)
})
.collect();
let d_input_zeros: Vec<_> = input_zeros
.iter()
.map(|ct| {
let d_ct_input = CudaLweCiphertextList::from_lwe_ciphertext(&ct.as_lwe_64(), streams);
CudaDynLwe::U64(d_ct_input)
})
.collect();
let id_lut = cuda_sks.generate_lookup_table(|x| x);
let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut.acc, streams);
let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
degree: crate::shortint::ciphertext::Degree::new(params.message_modulus().0 - 1),
message_modulus: params.message_modulus(),
carry_modulus: params.carry_modulus(),
atomic_pattern: params.atomic_pattern(),
noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
};
let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
.map(|_| CudaSideResources::new(streams, cuda_block_info))
.collect();
let dp_scalar = params.carry_modulus().0;
let storage_modulus_log = cuda_compression_key.storage_log_modulus;
let (d_before_packing, d_after_packing, d_after_ms) = br_dp_packing_ks_ms(
d_input_zeros,
cuda_sks,
&d_accumulator,
dp_scalar,
&cuda_compression_key.packing_key_switching_key,
storage_modulus_log,
&mut cuda_side_resources,
);
let compute_large_lwe_secret_key = cks.key.encryption_key();
let compression_glwe_secret_key = &compression_private_key.key.post_packing_ks_key;
let compute_encoding = cuda_sks.encoding();
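// After the dot product the message occupies the carry space, so ciphertexts on the
// compression path are decoded with a carry-free encoding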
let compression_encoding = ShortintEncoding {
carry_modulus: CarryModulus(1),
..compute_encoding
};
let after_packing = cuda_glwe_list_to_glwe_ciphertext(&d_after_packing, streams);
let after_ms = cuda_glwe_list_to_glwe_ciphertext(&d_after_ms, streams);
(
d_before_packing
.into_iter()
.map(|(d_input, d_pbs_result, d_dp_result)| {
let input = d_input.as_ct_64_cpu(streams);
let pbs_result = d_pbs_result.as_ct_64_cpu(streams);
let dp_result = d_dp_result.as_ct_64_cpu(streams);
(
match &cks.key.atomic_pattern {
AtomicPatternClientKey::Standard(standard_atomic_pattern_client_key) => {
DecryptionAndNoiseResult::new_from_lwe(
&input,
&standard_atomic_pattern_client_key.lwe_secret_key,
msg,
&compute_encoding,
)
}
AtomicPatternClientKey::KeySwitch32(_ks32_atomic_pattern_client_key) => {
panic!("KS32 Atomic Pattern not supported on GPU tests yet");
}
},
DecryptionAndNoiseResult::new_from_lwe(
&pbs_result,
&compute_large_lwe_secret_key,
msg,
&compute_encoding,
),
DecryptionAndNoiseResult::new_from_lwe(
&dp_result,
&compute_large_lwe_secret_key,
msg,
&compression_encoding,
),
)
})
.collect(),
DecryptionAndNoiseResult::new_from_glwe(
&after_packing,
compression_glwe_secret_key,
compression_private_key.key.params.lwe_per_glwe(),
msg,
&compression_encoding,
),
DecryptionAndNoiseResult::new_from_glwe(
&after_ms,
compression_glwe_secret_key,
compression_private_key.key.params.lwe_per_glwe(),
msg,
&compression_encoding,
),
)
}
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
fn encrypt_br_dp_packing_ks_ms_noise_helper_gpu(
params: AtomicPatternParameters,
comp_params: CompressionParameters,
single_cks: &ClientKey,
single_cuda_sks: &CudaServerKey,
single_compression_private_key: &CompressionPrivateKeys,
single_cuda_compression_key: &CudaCompressionKey,
msg: u64,
streams: &CudaStreams,
) -> (
Vec<(NoiseSample, NoiseSample, NoiseSample)>,
Vec<NoiseSample>,
Vec<NoiseSample>,
) {
let (before_packing, after_packing, after_ms) = encrypt_br_dp_packing_ks_ms_inner_helper_gpu(
params,
comp_params,
single_cks,
single_cuda_sks,
single_compression_private_key,
single_cuda_compression_key,
msg,
streams,
);
(
before_packing
.into_iter()
.map(|(input, after_pbs, after_dp)| {
(
input
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
after_pbs
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
after_dp
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
)
})
.collect(),
after_packing
.into_iter()
.map(|x| {
x.get_noise_if_decryption_was_correct()
.expect("Decryption Failed")
})
.collect(),
after_ms
.into_iter()
.map(|x| {
x.get_noise_if_decryption_was_correct()
.expect("Decryption Failed")
})
.collect(),
)
}
#[allow(clippy::type_complexity, clippy::too_many_arguments)]
fn encrypt_br_dp_packing_ks_ms_pfail_helper_gpu(
params: AtomicPatternParameters,
comp_params: CompressionParameters,
single_cks: &ClientKey,
single_cuda_sks: &CudaServerKey,
single_compression_private_key: &CompressionPrivateKeys,
single_cuda_compression_key: &CudaCompressionKey,
msg: u64,
streams: &CudaStreams,
) -> Vec<DecryptionAndNoiseResult> {
let (_before_packing, _after_packing, after_ms) = encrypt_br_dp_packing_ks_ms_inner_helper_gpu(
params,
comp_params,
single_cks,
single_cuda_sks,
single_compression_private_key,
single_cuda_compression_key,
msg,
streams,
);
after_ms
}
fn noise_check_encrypt_br_dp_packing_ks_ms_noise_gpu(meta_params: MetaParameters) {
let (params, comp_params) = (
meta_params.compute_parameters,
meta_params.compression_parameters.unwrap(),
);
let gpu_index = 0;
let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let block_params: ShortintParameterSet = params.into();
let cks = crate::integer::ClientKey::new(block_params);
let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
let private_compression_key = cks.new_compression_private_key(comp_params);
let (compressed_compression_key, _compressed_decompression_key) =
cks.new_compressed_compression_decompression_keys(&private_compression_key);
let compression_key = compressed_compression_key.decompress();
let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&streams);
let noise_simulation_bsk =
NoiseSimulationLweFourierBsk::new_from_atomic_pattern_parameters(params);
let noise_simulation_packing_key =
NoiseSimulationLwePackingKeyswitchKey::new_from_comp_parameters(params, comp_params);
assert!(noise_simulation_bsk.matches_actual_bsk_gpu(&cuda_sks.bootstrapping_key));
assert!(noise_simulation_packing_key.matches_actual_shortint_comp_key(&compression_key.key));
// The multiplication done in the compression moves the message up to the top of the carry
// space; multiplying by the carry modulus achieves that
let dp_scalar = params.carry_modulus().0;
let noise_simulation_accumulator = NoiseSimulationGlwe::new(
noise_simulation_bsk.output_glwe_size().to_glwe_dimension(),
noise_simulation_bsk.output_polynomial_size(),
Variance(0.0),
noise_simulation_bsk.modulus(),
);
let lwe_per_glwe = cuda_compression_key.lwe_per_glwe;
let storage_modulus_log = cuda_compression_key.storage_log_modulus;
let br_input_modulus_log = cuda_sks.br_input_modulus_log();
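// Run the noise simulation from noiseless (zero-variance) inputs so that only the noise added
// by the circuit itself is tracked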
let (_before_packing_sim, _after_packing_sim, after_ms_sim) = {
let noise_simulation = NoiseSimulationLwe::new(
cks.parameters().lwe_dimension(),
Variance(0.0),
NoiseSimulationModulus::from_ciphertext_modulus(cks.parameters().ciphertext_modulus()),
);
br_dp_packing_ks_ms(
vec![noise_simulation; lwe_per_glwe.0],
&noise_simulation_bsk,
&noise_simulation_accumulator,
dp_scalar,
&noise_simulation_packing_key,
storage_modulus_log,
&mut vec![(); lwe_per_glwe.0],
)
};
let input_zeros: Vec<_> = (0..lwe_per_glwe.0)
.map(|_| {
cks.key
.encrypt_noiseless_pbs_input_dyn_lwe(br_input_modulus_log, 0)
})
.collect();
let d_input_zeros: Vec<_> = input_zeros
.iter()
.map(|ct| {
let d_ct_input = CudaLweCiphertextList::from_lwe_ciphertext(&ct.as_lwe_64(), &streams);
CudaDynLwe::U64(d_ct_input)
})
.collect();
let id_lut = cuda_sks.generate_lookup_table(|x| x);
let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut.acc, &streams);
let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
degree: crate::shortint::ciphertext::Degree::new(params.message_modulus().0 - 1),
message_modulus: params.message_modulus(),
carry_modulus: params.carry_modulus(),
atomic_pattern: params.atomic_pattern(),
noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
};
let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
.map(|_| CudaSideResources::new(&streams, cuda_block_info))
.collect();
// Check that the circuit is correct with respect to the core implementation, i.e. that it
// does not crash on dimension checks
let (expected_glwe_size_out, expected_polynomial_size_out, expected_modulus_f64_out) = {
let (_before_packing_sim, _after_packing, after_ms) = br_dp_packing_ks_ms(
d_input_zeros,
&cuda_sks,
&d_accumulator,
dp_scalar,
&cuda_compression_key.packing_key_switching_key,
storage_modulus_log,
&mut cuda_side_resources,
);
(
after_ms.glwe_dimension().to_glwe_size(),
after_ms.polynomial_size(),
after_ms.ciphertext_modulus().raw_modulus_float(),
)
};
assert_eq!(after_ms_sim.glwe_size(), expected_glwe_size_out);
assert_eq!(after_ms_sim.polynomial_size(), expected_polynomial_size_out);
assert_eq!(after_ms_sim.modulus().as_f64(), expected_modulus_f64_out);
let cleartext_modulus = params.message_modulus().0 * params.carry_modulus().0;
let mut noise_samples_before_ms = vec![];
let mut noise_samples_after_ms = vec![];
let chunk_size = 8;
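// Use one CUDA stream per chunk slot so that the samples of a chunk can be processed in
// parallel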
let vec_local_streams = (0..chunk_size)
.map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)))
.collect::<Vec<_>>();
for _ in 0..cleartext_modulus {
let (current_noise_samples_before_ms, current_noise_samples_after_ms): (Vec<_>, Vec<_>) =
(0..SAMPLES_PER_MSG_PACKING_KS_NOISE)
.collect::<Vec<_>>()
.chunks(chunk_size)
.flat_map(|chunk| {
chunk
.into_par_iter()
.map(|i| {
let local_stream = &vec_local_streams[*i % chunk_size];
let (_before_packing, after_packing, after_ms) =
encrypt_br_dp_packing_ks_ms_noise_helper_gpu(
params,
comp_params,
&cks,
&cuda_sks,
&private_compression_key,
&cuda_compression_key,
0,
local_stream,
);
(after_packing, after_ms)
})
.collect::<Vec<_>>()
})
.unzip();
noise_samples_before_ms.extend(current_noise_samples_before_ms);
noise_samples_after_ms.extend(current_noise_samples_after_ms);
}
let noise_samples_before_ms_flattened: Vec<_> = noise_samples_before_ms
.into_iter()
.flatten()
.map(|x| x.value)
.collect();
let noise_samples_after_ms_flattened: Vec<_> = noise_samples_after_ms
.into_iter()
.flatten()
.map(|x| x.value)
.collect();
let before_ms_normality =
normality_check(&noise_samples_before_ms_flattened, "before ms", 0.01);
let after_ms_is_ok = mean_and_variance_check(
&noise_samples_after_ms_flattened,
"after_ms",
0.0,
after_ms_sim.variance_per_occupied_slot(),
comp_params.packing_ks_key_noise_distribution(),
after_ms_sim
.glwe_dimension()
.to_equivalent_lwe_dimension(after_ms_sim.polynomial_size()),
after_ms_sim.modulus().as_f64(),
);
assert!(before_ms_normality.null_hypothesis_is_valid && after_ms_is_ok);
}
create_gpu_parameterized_test!(noise_check_encrypt_br_dp_packing_ks_ms_noise_gpu {
TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
});
fn noise_check_encrypt_br_dp_packing_ks_ms_pfail_gpu(meta_params: MetaParameters) {
let (pfail_test_meta, params, comp_params) = {
let (mut params, comp_params) = (
meta_params.compute_parameters,
meta_params.compression_parameters.unwrap(),
);
let original_message_modulus = params.message_modulus();
let original_carry_modulus = params.carry_modulus();
// For now, only 2_2 parameters are allowed; heuristics for other parameter sets are left for later
assert_eq!(original_message_modulus.0, 4);
assert_eq!(original_carry_modulus.0, 4);
let noise_simulation_bsk =
NoiseSimulationLweFourierBsk::new_from_atomic_pattern_parameters(params);
let noise_simulation_packing_key =
NoiseSimulationLwePackingKeyswitchKey::new_from_comp_parameters(params, comp_params);
// The multiplication done in the compression moves the message up to the top of the carry
// space; multiplying by the carry modulus achieves that
let dp_scalar = params.carry_modulus().0;
let noise_simulation_accumulator = NoiseSimulationGlwe::new(
noise_simulation_bsk.output_glwe_size().to_glwe_dimension(),
noise_simulation_bsk.output_polynomial_size(),
Variance(0.0),
noise_simulation_bsk.modulus(),
);
let lwe_per_glwe = comp_params.lwe_per_glwe();
let storage_modulus_log = comp_params.storage_log_modulus();
let (_before_packing_sim, _after_packing_sim, after_ms_sim) = {
let noise_simulation = NoiseSimulationLwe::new(
params.lwe_dimension(),
Variance(0.0),
NoiseSimulationModulus::from_ciphertext_modulus(params.ciphertext_modulus()),
);
br_dp_packing_ks_ms(
vec![noise_simulation; lwe_per_glwe.0],
&noise_simulation_bsk,
&noise_simulation_accumulator,
dp_scalar,
&noise_simulation_packing_key,
storage_modulus_log,
&mut vec![(); lwe_per_glwe.0],
)
};
let expected_variance_after_storage = after_ms_sim.variance_per_occupied_slot();
let compression_carry_mod = CarryModulus(1);
let compression_message_mod = original_message_modulus;
let compression_precision_with_padding =
precision_with_padding(compression_message_mod, compression_carry_mod);
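// The pfail is estimated from the noise variance after storage and the number of plaintext
// bits (padding bit included) that must decode correctly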
let expected_pfail_for_storage = expected_pfail_for_precision(
compression_precision_with_padding,
expected_variance_after_storage,
);
let original_pfail_and_precision = PfailAndPrecision::new(
expected_pfail_for_storage,
compression_message_mod,
compression_carry_mod,
);
// Here we update the message modulus only:
// - the message modulus matches between the compression encoding and the compute encoding
// - the carry modulus stays the same, so we apply the same dot product as for standard 2_2
// - the effective encoding after the storage is the one used to evaluate the pfail
let updated_message_mod = MessageModulus(1 << 6);
let updated_carry_mod = compression_carry_mod;
update_ap_params_msg_and_carry_moduli(&mut params, updated_message_mod, updated_carry_mod);
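// Widening the message modulus increases the precision that must decode correctly, raising the
// expected pfail to a level that is measurable within the number of runs below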
assert!(
(params.message_modulus().0 * params.carry_modulus().0).ilog2()
<= comp_params.storage_log_modulus().0 as u32,
"Compression storage modulus cannot store enough bits for pfail estimation"
);
let updated_precision_with_padding =
precision_with_padding(updated_message_mod, updated_carry_mod);
let new_expected_pfail_for_storage = expected_pfail_for_precision(
updated_precision_with_padding,
expected_variance_after_storage,
);
let new_expected_pfail_and_precision = PfailAndPrecision::new(
new_expected_pfail_for_storage,
updated_message_mod,
updated_carry_mod,
);
let pfail_test_meta = if should_run_short_pfail_tests_debug() {
// To generate the same number of keys as in the case where a single run produces a single
// sample
let expected_fails = 200 * lwe_per_glwe.0 as u32;
PfailTestMeta::new_with_desired_expected_fails(
original_pfail_and_precision,
new_expected_pfail_and_precision,
expected_fails,
)
} else {
// To guarantee 1_000_000 keysets are generated
let total_runs = 1_000_000 * lwe_per_glwe.0 as u32;
PfailTestMeta::new_with_total_runs(
original_pfail_and_precision,
new_expected_pfail_and_precision,
total_runs,
)
};
(pfail_test_meta, params, comp_params)
};
let gpu_index = 0;
let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let block_params: ShortintParameterSet = params.into();
let cks = crate::integer::ClientKey::new(block_params);
let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
let private_compression_key = cks.new_compression_private_key(comp_params);
let (compressed_compression_key, _compressed_decompression_key) =
cks.new_compressed_compression_decompression_keys(&private_compression_key);
let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&streams);
let lwe_per_glwe = cuda_compression_key.lwe_per_glwe;
let total_runs_for_expected_fails = pfail_test_meta
.total_runs_for_expected_fails()
.div_ceil(lwe_per_glwe.0.try_into().unwrap());
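// Each helper run packs lwe_per_glwe ciphertexts and thus yields lwe_per_glwe samples, hence
// the division (rounded up) of the number of runs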
let chunk_size = 8;
let vec_local_streams = (0..chunk_size)
.map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)))
.collect::<Vec<_>>();
let measured_fails: f64 = (0..total_runs_for_expected_fails)
.collect::<Vec<_>>()
.chunks(chunk_size)
.flat_map(|chunk| {
chunk
.into_par_iter()
.map(|i| {
let local_streams = &vec_local_streams[*i as usize % chunk_size];
let after_ms_decryption_result = encrypt_br_dp_packing_ks_ms_pfail_helper_gpu(
params,
comp_params,
&cks,
&cuda_sks,
&private_compression_key,
&cuda_compression_key,
0,
local_streams,
);
after_ms_decryption_result
.into_iter()
.map(|result| result.failure_as_f64())
.sum::<f64>()
})
.collect::<Vec<_>>()
})
.sum();
let test_result = PfailTestResult { measured_fails };
pfail_check(&pfail_test_meta, test_result);
}
create_gpu_parameterized_test!(noise_check_encrypt_br_dp_packing_ks_ms_pfail_gpu {
TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
});

View File

@@ -0,0 +1,869 @@
use super::utils::noise_simulation::{CudaDynLwe, CudaSideResources};
use crate::core_crypto::commons::noise_formulas::noise_simulation::{
NoiseSimulationLweFourier128Bsk, NoiseSimulationLwePackingKeyswitchKey,
};
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::{GlweCiphertext, LweCiphertextCount};
use crate::integer::gpu::CudaServerKey;
use crate::integer::noise_squashing::NoiseSquashingPrivateKey;
use crate::integer::CompressedServerKey;
use crate::core_crypto::commons::parameters::CiphertextModulusLog;
use crate::core_crypto::prelude::generate_programmable_bootstrap_glwe_lut;
use crate::integer::ciphertext::NoiseSquashingCompressionPrivateKey;
use crate::integer::gpu::list_compression::server_keys::CudaNoiseSquashingCompressionKey;
use crate::integer::gpu::server_key::radix::tests_unsigned::create_gpu_parameterized_test;
use crate::integer::gpu::server_key::radix::{CudaNoiseSquashingKey, CudaUnsignedRadixCiphertext};
use crate::integer::gpu::unchecked_small_scalar_mul_integer_async;
use crate::integer::IntegerCiphertext;
use crate::shortint::client_key::atomic_pattern::AtomicPatternClientKey;
use crate::shortint::parameters::noise_squashing::NoiseSquashingParameters;
use crate::shortint::parameters::test_params::TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128;
use crate::shortint::parameters::{
AtomicPatternParameters, MetaParameters, NoiseSquashingCompressionParameters, Variance,
};
use crate::shortint::server_key::tests::noise_distribution::dp_ks_pbs128_packingks::{
dp_ks_any_ms_standard_pbs128, dp_ks_any_ms_standard_pbs128_packing_ks,
};
use crate::shortint::server_key::tests::noise_distribution::should_use_single_key_debug;
use crate::shortint::server_key::tests::noise_distribution::utils::noise_simulation::{
NoiseSimulationGlwe, NoiseSimulationLwe, NoiseSimulationLweFourierBsk,
NoiseSimulationLweKeyswitchKey, NoiseSimulationModulusSwitchConfig,
};
use crate::shortint::server_key::tests::noise_distribution::utils::{
mean_and_variance_check, DecryptionAndNoiseResult, NoiseSample,
};
use crate::shortint::{PaddingBit, ShortintEncoding, ShortintParameterSet};
use crate::GpuIndex;
use rayon::prelude::*;
/// Test function to verify that the noise checking tools match the actual atomic patterns
/// implemented in shortint for GPU
fn sanity_check_encrypt_dp_ks_standard_pbs128_packing_ks_gpu(meta_params: MetaParameters) {
let (atomic_params, noise_squashing_params, noise_squashing_compression_params) = {
let meta_noise_squashing_params = meta_params.noise_squashing_parameters.unwrap();
(
meta_params.compute_parameters,
meta_noise_squashing_params.parameters,
meta_noise_squashing_params.compression_parameters.unwrap(),
)
};
let gpu_index = 0;
let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let block_params: ShortintParameterSet = atomic_params.into();
let cks = crate::integer::ClientKey::new(block_params);
let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_params);
let compressed_noise_squashing_compression_key =
cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
let noise_squashing_key = compressed_noise_squashing_compression_key.decompress();
let cuda_noise_squashing_key =
compressed_noise_squashing_compression_key.decompress_to_cuda(&streams);
let noise_squashing_compression_private_key =
NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_params);
let noise_squashing_compression_key = noise_squashing_private_key
.new_noise_squashing_compression_key(&noise_squashing_compression_private_key);
let cuda_noise_squashing_compression_key =
CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
&noise_squashing_compression_key,
&streams,
);
let lwe_per_glwe = cuda_noise_squashing_compression_key.lwe_per_glwe;
let modulus_switch_config = cuda_noise_squashing_key.noise_simulation_modulus_switch_config();
let br_input_modulus_log = noise_squashing_key.key.br_input_modulus_log();
let u128_encoding = ShortintEncoding {
ciphertext_modulus: noise_squashing_params.ciphertext_modulus(),
message_modulus: noise_squashing_params.message_modulus(),
carry_modulus: noise_squashing_params.carry_modulus(),
padding_bit: PaddingBit::Yes,
};
let max_scalar_mul = cuda_sks.max_noise_level.get();
let id_lut_cpu = generate_programmable_bootstrap_glwe_lut(
noise_squashing_key.key.polynomial_size(),
noise_squashing_key.key.glwe_size(),
u128_encoding
.cleartext_space_without_padding()
.try_into()
.unwrap(),
u128_encoding.ciphertext_modulus,
u128_encoding.delta(),
|x| x,
);
let id_lut_gpu = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut_cpu, &streams);
let input_zeros: Vec<_> = (0..lwe_per_glwe.0).map(|_| cks.key.encrypt(0)).collect();
let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
degree: crate::shortint::ciphertext::Degree::new(atomic_params.message_modulus().0 - 1),
message_modulus: atomic_params.message_modulus(),
carry_modulus: atomic_params.carry_modulus(),
atomic_pattern: atomic_params.atomic_pattern(),
noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
};
let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
.map(|_| CudaSideResources::new(&streams, cuda_block_info))
.collect();
let input_zero_as_lwe: Vec<_> = input_zeros
.iter()
.map(|ct| {
let d_ct_input = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
&crate::integer::RadixCiphertext::from_blocks(vec![ct.clone()]),
&streams,
);
CudaDynLwe::U64(d_ct_input.ciphertext.d_blocks)
})
.collect();
let (_before_packing, d_after_packing) = dp_ks_any_ms_standard_pbs128_packing_ks(
input_zero_as_lwe,
max_scalar_mul,
&cuda_sks,
modulus_switch_config,
&cuda_noise_squashing_key,
br_input_modulus_log,
&id_lut_gpu,
&cuda_noise_squashing_compression_key.packing_key_switching_key,
&mut cuda_side_resources,
);
let cuda_noise_squashed_cts: Vec<_> = input_zeros
.into_par_iter()
.map(|ct| {
let radix_ct = crate::integer::RadixCiphertext::from_blocks(vec![ct]);
let mut d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&radix_ct, &streams);
unsafe {
unchecked_small_scalar_mul_integer_async(
&streams,
&mut d_ct.ciphertext,
max_scalar_mul,
atomic_params.message_modulus(),
atomic_params.carry_modulus(),
);
}
streams.synchronize();
cuda_noise_squashing_key.unchecked_squash_ciphertext_noise(
&d_ct.ciphertext,
&cuda_sks,
&streams,
)
})
.collect();
let gpu_compressed = cuda_noise_squashing_compression_key
.compress_noise_squashed_ciphertexts_into_list(&cuda_noise_squashed_cts, &streams);
let gpu_extracted = gpu_compressed.extract_glwe(0, &streams);
let extracted_list = gpu_extracted.to_glwe_ciphertext_list(&streams);
let extracted_glwe = GlweCiphertext::from_container(
extracted_list.clone().into_container(),
extracted_list.polynomial_size(),
extracted_list.ciphertext_modulus(),
);
let after_packing_list = d_after_packing.to_glwe_ciphertext_list(&streams);
let mut after_packing = GlweCiphertext::from_container(
after_packing_list.clone().into_container(),
after_packing_list.polynomial_size(),
after_packing_list.ciphertext_modulus(),
);
// Bodies that were not filled during the packing are zeroed out so that the comparison only
// covers the occupied slots
after_packing.get_mut_body().as_mut()[lwe_per_glwe.0..].fill(0);
assert_eq!(after_packing.as_view(), extracted_glwe.as_view());
}
/// Test function to verify that the noise checking tools match the actual atomic patterns
/// implemented in shortint for GPU
fn sanity_check_encrypt_dp_ks_standard_pbs128_gpu(meta_params: MetaParameters) {
let (params, noise_squashing_params) = {
let meta_noise_squashing_params = meta_params.noise_squashing_parameters.unwrap();
(
meta_params.compute_parameters,
meta_noise_squashing_params.parameters,
)
};
let gpu_index = 0;
let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let block_params: ShortintParameterSet = params.into();
let cks = crate::integer::ClientKey::new(block_params);
let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_params);
let compressed_noise_squashing_compression_key =
cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
let noise_squashing_key = compressed_noise_squashing_compression_key.decompress();
let cuda_noise_squashing_key =
compressed_noise_squashing_compression_key.decompress_to_cuda(&streams);
let modulus_switch_config = cuda_noise_squashing_key.noise_simulation_modulus_switch_config();
let br_input_modulus_log = noise_squashing_key.key.br_input_modulus_log();
let u128_encoding = ShortintEncoding {
ciphertext_modulus: noise_squashing_params.ciphertext_modulus(),
message_modulus: noise_squashing_params.message_modulus(),
carry_modulus: noise_squashing_params.carry_modulus(),
padding_bit: PaddingBit::Yes,
};
let max_scalar_mul = cuda_sks.max_noise_level.get();
let id_lut_cpu = generate_programmable_bootstrap_glwe_lut(
noise_squashing_key.key.polynomial_size(),
noise_squashing_key.key.glwe_size(),
u128_encoding
.cleartext_space_without_padding()
.try_into()
.unwrap(),
u128_encoding.ciphertext_modulus,
u128_encoding.delta(),
|x| x,
);
let id_lut_gpu = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut_cpu, &streams);
let lwe_per_glwe = LweCiphertextCount(128);
let input_zeros: Vec<_> = (0..lwe_per_glwe.0).map(|_| cks.key.encrypt(0)).collect();
let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
degree: crate::shortint::ciphertext::Degree::new(params.message_modulus().0 - 1),
message_modulus: params.message_modulus(),
carry_modulus: params.carry_modulus(),
atomic_pattern: params.atomic_pattern(),
noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
};
let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
.map(|_| CudaSideResources::new(&streams, cuda_block_info))
.collect();
let input_zero_as_lwe: Vec<_> = input_zeros
.iter()
.map(|ct| {
let d_ct_input = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
&crate::integer::RadixCiphertext::from_blocks(vec![ct.clone()]),
&streams,
);
CudaDynLwe::U64(d_ct_input.ciphertext.d_blocks)
})
.collect();
let res: Vec<_> = input_zero_as_lwe
.into_par_iter()
.zip(cuda_side_resources.par_iter_mut())
.map(|(input, side_resources)| {
let (input, after_dp, ks_result, drift_technique_result, ms_result, pbs_result) =
dp_ks_any_ms_standard_pbs128(
input,
max_scalar_mul,
&cuda_sks,
modulus_switch_config,
&cuda_noise_squashing_key,
br_input_modulus_log,
&id_lut_gpu,
side_resources,
);
(
input,
after_dp,
ks_result,
drift_technique_result,
ms_result,
pbs_result,
)
})
.collect();
let input_zeros_non_pattern: Vec<_> = input_zeros
.iter()
.map(|ct| {
CudaUnsignedRadixCiphertext::from_radix_ciphertext(
&crate::integer::RadixCiphertext::from_blocks(vec![ct.clone()]),
&streams,
)
})
.collect();
let vector_non_pattern: Vec<_> = input_zeros_non_pattern
.into_par_iter()
.map(|mut d_ct_input2| {
unsafe {
unchecked_small_scalar_mul_integer_async(
&streams,
&mut d_ct_input2.ciphertext,
max_scalar_mul,
params.message_modulus(),
params.carry_modulus(),
);
}
streams.synchronize();
cuda_noise_squashing_key
.squash_radix_ciphertext_noise(&cuda_sks, &d_ct_input2.ciphertext, &streams)
.unwrap()
})
.collect();
let vector_pattern_cpu: Vec<_> = res
.into_iter()
.map(
|(_input, _after_dp, _ks_result, _drift_technique_result, _ms_result, pbs_result)| {
pbs_result.as_ct_128_cpu(&streams)
},
)
.collect();
let vector_non_pattern_cpu: Vec<_> = vector_non_pattern
.into_par_iter()
.map(|cuda_squashed_radix_ct| {
let squashed_noise_ct_cpu =
cuda_squashed_radix_ct.to_squashed_noise_radix_ciphertext(&streams);
squashed_noise_ct_cpu.packed_blocks()[0]
.lwe_ciphertext()
.clone()
})
.collect();
// Check that the results of the pattern and non-pattern paths are equivalent
assert_eq!(vector_pattern_cpu, vector_non_pattern_cpu);
}
#[allow(clippy::too_many_arguments)]
#[allow(clippy::type_complexity)]
fn encrypt_dp_ks_standard_pbs128_packing_ks_inner_helper_gpu(
params: AtomicPatternParameters,
noise_squashing_params: NoiseSquashingParameters,
noise_squashing_compression_params: NoiseSquashingCompressionParameters,
single_cks: &crate::integer::ClientKey,
single_cuda_sks: &CudaServerKey,
single_noise_squashing_private_key: &NoiseSquashingPrivateKey,
single_noise_squashing_key: &crate::integer::noise_squashing::NoiseSquashingKey,
single_cuda_noise_squashing_key: &CudaNoiseSquashingKey,
single_noise_squashing_compression_private_key: &NoiseSquashingCompressionPrivateKey,
single_cuda_noise_squashing_compression_key: &CudaNoiseSquashingCompressionKey,
msg: u64,
scalar_for_multiplication: u64,
br_input_modulus_log: CiphertextModulusLog,
streams: &CudaStreams,
) -> (
Vec<(
DecryptionAndNoiseResult,
DecryptionAndNoiseResult,
DecryptionAndNoiseResult,
DecryptionAndNoiseResult,
DecryptionAndNoiseResult,
DecryptionAndNoiseResult,
)>,
Vec<DecryptionAndNoiseResult>,
) {
let thread_cks: crate::integer::ClientKey;
let thread_cuda_sks: CudaServerKey;
let thread_noise_squashing_private_key: NoiseSquashingPrivateKey;
let thread_noise_squashing_key: crate::integer::noise_squashing::NoiseSquashingKey;
let thread_cuda_noise_squashing_key: CudaNoiseSquashingKey;
let thread_noise_squashing_compression_private_key: NoiseSquashingCompressionPrivateKey;
let thread_cuda_noise_squashing_compression_key: CudaNoiseSquashingCompressionKey;
let (
cks,
cuda_sks,
noise_squashing_private_key,
noise_squashing_key,
cuda_noise_squashing_key,
noise_squashing_compression_private_key,
cuda_noise_squashing_compression_key,
) = if should_use_single_key_debug() {
(
single_cks,
single_cuda_sks,
single_noise_squashing_private_key,
single_noise_squashing_key,
single_cuda_noise_squashing_key,
single_noise_squashing_compression_private_key,
single_cuda_noise_squashing_compression_key,
)
} else {
let block_params: ShortintParameterSet = params.into();
thread_cks = crate::integer::ClientKey::new(block_params);
let thread_compressed_server_key =
CompressedServerKey::new_radix_compressed_server_key(&thread_cks);
thread_cuda_sks =
CudaServerKey::decompress_from_cpu(&thread_compressed_server_key, streams);
thread_noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_params);
let thread_compressed_noise_squashing_compression_key =
thread_cks.new_compressed_noise_squashing_key(&thread_noise_squashing_private_key);
thread_noise_squashing_key = thread_compressed_noise_squashing_compression_key.decompress();
thread_cuda_noise_squashing_key =
thread_compressed_noise_squashing_compression_key.decompress_to_cuda(streams);
thread_noise_squashing_compression_private_key =
NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_params);
let thread_noise_squashing_compression_key = thread_noise_squashing_private_key
.new_noise_squashing_compression_key(&thread_noise_squashing_compression_private_key);
thread_cuda_noise_squashing_compression_key =
CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
&thread_noise_squashing_compression_key,
streams,
);
(
&thread_cks,
&thread_cuda_sks,
&thread_noise_squashing_private_key,
&thread_noise_squashing_key,
&thread_cuda_noise_squashing_key,
&thread_noise_squashing_compression_private_key,
&thread_cuda_noise_squashing_compression_key,
)
};
let modulus_switch_config = cuda_noise_squashing_key.noise_simulation_modulus_switch_config();
let bsk_polynomial_size = noise_squashing_key.key.polynomial_size();
let bsk_glwe_size = noise_squashing_key.key.glwe_size();
let u128_encoding = ShortintEncoding {
ciphertext_modulus: noise_squashing_params.ciphertext_modulus(),
message_modulus: noise_squashing_params.message_modulus(),
carry_modulus: noise_squashing_params.carry_modulus(),
padding_bit: PaddingBit::Yes,
};
let id_lut_cpu = generate_programmable_bootstrap_glwe_lut(
bsk_polynomial_size,
bsk_glwe_size,
u128_encoding
.cleartext_space_without_padding()
.try_into()
.unwrap(),
u128_encoding.ciphertext_modulus,
u128_encoding.delta(),
|x| x,
);
let id_lut_gpu = CudaGlweCiphertextList::from_glwe_ciphertext(&id_lut_cpu, streams);
let lwe_per_glwe = cuda_noise_squashing_compression_key.lwe_per_glwe;
let input_zeros: Vec<_> = (0..lwe_per_glwe.0).map(|_| cks.key.encrypt(msg)).collect();
let cuda_block_info = crate::integer::gpu::ciphertext::info::CudaBlockInfo {
degree: crate::shortint::ciphertext::Degree::new(params.message_modulus().0 - 1),
message_modulus: params.message_modulus(),
carry_modulus: params.carry_modulus(),
atomic_pattern: params.atomic_pattern(),
noise_level: crate::shortint::parameters::NoiseLevel::NOMINAL,
};
let mut cuda_side_resources: Vec<CudaSideResources> = (0..input_zeros.len())
.map(|_| CudaSideResources::new(streams, cuda_block_info))
.collect();
let input_zero_as_lwe: Vec<_> = input_zeros
.iter()
.map(|ct| {
let d_ct_input = CudaUnsignedRadixCiphertext::from_radix_ciphertext(
&crate::integer::RadixCiphertext::from_blocks(vec![ct.clone()]),
streams,
);
CudaDynLwe::U64(d_ct_input.ciphertext.d_blocks)
})
.collect();
let (before_packing_gpu, after_packing_gpu) = dp_ks_any_ms_standard_pbs128_packing_ks(
input_zero_as_lwe,
scalar_for_multiplication,
cuda_sks,
modulus_switch_config,
cuda_noise_squashing_key,
br_input_modulus_log,
&id_lut_gpu,
&cuda_noise_squashing_compression_key.packing_key_switching_key,
&mut cuda_side_resources,
);
let before_packing: Vec<_> = before_packing_gpu
.into_iter()
.map(
|(
input_gpu,
after_dp_gpu,
after_ks_gpu,
after_drift_gpu,
after_ms_gpu,
after_pbs128_gpu,
)| {
match &cks.key.atomic_pattern {
AtomicPatternClientKey::Standard(standard_atomic_pattern_client_key) => {
let params = standard_atomic_pattern_client_key.parameters;
let u64_encoding = ShortintEncoding {
ciphertext_modulus: params.ciphertext_modulus(),
message_modulus: params.message_modulus(),
carry_modulus: params.carry_modulus(),
padding_bit: PaddingBit::Yes,
};
let large_lwe_secret_key =
standard_atomic_pattern_client_key.large_lwe_secret_key();
let small_lwe_secret_key =
standard_atomic_pattern_client_key.small_lwe_secret_key();
let input_ct = input_gpu.as_ct_64_cpu(streams);
let after_dp_ct = after_dp_gpu.as_ct_64_cpu(streams);
let after_ks_ct = after_ks_gpu.as_ct_64_cpu(streams);
let before_ms_gpu: &CudaDynLwe =
after_drift_gpu.as_ref().unwrap_or(&after_ks_gpu);
let before_ms_ct = before_ms_gpu.as_ct_64_cpu(streams);
let after_ms_ct = after_ms_gpu.as_ct_64_cpu(streams);
let after_pbs128_ct = after_pbs128_gpu.as_ct_128_cpu(streams);
(
DecryptionAndNoiseResult::new_from_lwe(
&input_ct,
&large_lwe_secret_key,
msg,
&u64_encoding,
),
DecryptionAndNoiseResult::new_from_lwe(
&after_dp_ct,
&large_lwe_secret_key,
msg,
&u64_encoding,
),
DecryptionAndNoiseResult::new_from_lwe(
&after_ks_ct,
&small_lwe_secret_key,
msg,
&u64_encoding,
),
DecryptionAndNoiseResult::new_from_lwe(
&before_ms_ct,
&small_lwe_secret_key,
msg,
&u64_encoding,
),
DecryptionAndNoiseResult::new_from_lwe(
&after_ms_ct,
&small_lwe_secret_key,
msg,
&u64_encoding,
),
DecryptionAndNoiseResult::new_from_lwe(
&after_pbs128_ct,
&noise_squashing_private_key
.key
.post_noise_squashing_lwe_secret_key(),
msg.into(),
&u128_encoding,
),
)
}
AtomicPatternClientKey::KeySwitch32(_ks32_atomic_pattern_client_key) => {
panic!("KS32 atomic pattern not supported for GPU yet");
}
}
},
)
.collect();
let after_packing_list = after_packing_gpu.to_glwe_ciphertext_list(streams);
let after_packing = GlweCiphertext::from_container(
after_packing_list.clone().into_container(),
after_packing_list.polynomial_size(),
after_packing_list.ciphertext_modulus(),
);
let after_packing = DecryptionAndNoiseResult::new_from_glwe(
&after_packing,
noise_squashing_compression_private_key
.key
.post_packing_ks_key(),
lwe_per_glwe,
msg.into(),
&u128_encoding,
);
assert_eq!(after_packing.len(), lwe_per_glwe.0);
(before_packing, after_packing)
}
#[allow(clippy::too_many_arguments)]
#[allow(clippy::type_complexity)]
fn encrypt_dp_ks_standard_pbs128_packing_ks_noise_helper_gpu(
params: AtomicPatternParameters,
noise_squashing_params: NoiseSquashingParameters,
noise_squashing_compression_params: NoiseSquashingCompressionParameters,
single_cks: &crate::integer::ClientKey,
single_cuda_sks: &CudaServerKey,
single_noise_squashing_private_key: &NoiseSquashingPrivateKey,
single_noise_squashing_key: &crate::integer::noise_squashing::NoiseSquashingKey,
single_cuda_noise_squashing_key: &CudaNoiseSquashingKey,
single_noise_squashing_compression_private_key: &NoiseSquashingCompressionPrivateKey,
single_cuda_noise_squashing_compression_key: &CudaNoiseSquashingCompressionKey,
msg: u64,
scalar_for_multiplication: u64,
br_input_modulus_log: CiphertextModulusLog,
streams: &CudaStreams,
) -> (
Vec<(
NoiseSample,
NoiseSample,
NoiseSample,
NoiseSample,
NoiseSample,
NoiseSample,
)>,
Vec<NoiseSample>,
) {
let (before_compression, after_compression) =
encrypt_dp_ks_standard_pbs128_packing_ks_inner_helper_gpu(
params,
noise_squashing_params,
noise_squashing_compression_params,
single_cks,
single_cuda_sks,
single_noise_squashing_private_key,
single_noise_squashing_key,
single_cuda_noise_squashing_key,
single_noise_squashing_compression_private_key,
single_cuda_noise_squashing_compression_key,
msg,
scalar_for_multiplication,
br_input_modulus_log,
streams,
);
(
before_compression
.into_iter()
.map(
|(input, after_dp, after_ks, after_drift, after_ms, after_pbs)| {
(
input
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
after_dp
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
after_ks
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
after_drift
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
after_ms
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
after_pbs
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed"),
)
},
)
.collect(),
after_compression
.into_iter()
.map(|after_compression| {
after_compression
.get_noise_if_decryption_was_correct()
.expect("Decryption Failed")
})
.collect(),
)
}
fn noise_check_encrypt_dp_ks_standard_pbs128_packing_ks_noise_gpu(meta_params: MetaParameters) {
let (atomic_params, noise_squashing_params, noise_squashing_compression_params) = {
let meta_noise_squashing_params = meta_params.noise_squashing_parameters.unwrap();
(
meta_params.compute_parameters,
meta_noise_squashing_params.parameters,
meta_noise_squashing_params.compression_parameters.unwrap(),
)
};
let gpu_index = 0;
let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let block_params: ShortintParameterSet = atomic_params.into();
let cks = crate::integer::ClientKey::new(block_params);
let compressed_server_key = CompressedServerKey::new_radix_compressed_server_key(&cks);
let cuda_sks = CudaServerKey::decompress_from_cpu(&compressed_server_key, &streams);
let noise_squashing_private_key = NoiseSquashingPrivateKey::new(noise_squashing_params);
let compressed_noise_squashing_compression_key =
cks.new_compressed_noise_squashing_key(&noise_squashing_private_key);
let noise_squashing_key = compressed_noise_squashing_compression_key.decompress();
let cuda_noise_squashing_key =
compressed_noise_squashing_compression_key.decompress_to_cuda(&streams);
let noise_squashing_compression_private_key =
NoiseSquashingCompressionPrivateKey::new(noise_squashing_compression_params);
let noise_squashing_compression_key = noise_squashing_private_key
.new_noise_squashing_compression_key(&noise_squashing_compression_private_key);
let cuda_noise_squashing_compression_key =
CudaNoiseSquashingCompressionKey::from_noise_squashing_compression_key(
&noise_squashing_compression_key,
&streams,
);
let noise_simulation_ksk =
NoiseSimulationLweKeyswitchKey::new_from_atomic_pattern_parameters(atomic_params);
let noise_simulation_bsk =
NoiseSimulationLweFourierBsk::new_from_atomic_pattern_parameters(atomic_params);
let noise_simulation_modulus_switch_config =
NoiseSimulationModulusSwitchConfig::new_from_atomic_pattern_parameters(atomic_params);
let noise_simulation_bsk128 =
NoiseSimulationLweFourier128Bsk::new_from_parameters(atomic_params, noise_squashing_params);
let noise_simulation_packing_key =
NoiseSimulationLwePackingKeyswitchKey::new_from_noise_squashing_parameters(
noise_squashing_params,
noise_squashing_compression_params,
);
assert!(noise_simulation_bsk.matches_actual_bsk_gpu(&cuda_sks.bootstrapping_key));
assert!(noise_simulation_bsk128
.matches_actual_shortint_noise_squashing_key(&noise_squashing_key.key));
assert!(noise_simulation_packing_key.matches_actual_pksk(
noise_squashing_compression_key
.key
.packing_key_switching_key()
));
let br_input_modulus_log = noise_squashing_key.key.br_input_modulus_log();
let max_scalar_mul = cuda_sks.max_noise_level.get();
let noise_simulation_accumulator = NoiseSimulationGlwe::new(
noise_simulation_bsk128
.output_glwe_size()
.to_glwe_dimension(),
noise_simulation_bsk128.output_polynomial_size(),
Variance(0.0),
noise_simulation_bsk128.modulus(),
);
let (_before_packing_sim, after_packing_sim) = {
let noise_simulation = NoiseSimulationLwe::encrypt(&cks.key, 0);
dp_ks_any_ms_standard_pbs128_packing_ks(
vec![noise_simulation; cuda_noise_squashing_compression_key.lwe_per_glwe.0],
max_scalar_mul,
&noise_simulation_ksk,
noise_simulation_modulus_switch_config.as_ref(),
&noise_simulation_bsk128,
br_input_modulus_log,
&noise_simulation_accumulator,
&noise_simulation_packing_key,
&mut vec![(); cuda_noise_squashing_compression_key.lwe_per_glwe.0],
)
};
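// Convert the GLWE simulation to its equivalent LWE form for the dimension, modulus and
// variance checks below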
let after_packing_sim = after_packing_sim.into_lwe();
// Check that the circuit is correct with respect to the core implementation, i.e. that it
// does not crash on dimension checks
let (expected_lwe_dimension_out, expected_modulus_f64_out) = {
let pksk = noise_squashing_compression_key
.key
.packing_key_switching_key();
let out_glwe_dim = pksk.output_key_glwe_dimension();
let out_poly_size = pksk.output_key_polynomial_size();
(
out_glwe_dim.to_equivalent_lwe_dimension(out_poly_size),
pksk.ciphertext_modulus().raw_modulus_float(),
)
};
assert_eq!(
after_packing_sim.lwe_dimension(),
expected_lwe_dimension_out
);
assert_eq!(
after_packing_sim.modulus().as_f64(),
expected_modulus_f64_out
);
let cleartext_modulus = atomic_params.message_modulus().0 * atomic_params.carry_modulus().0;
let mut noise_samples_after_packing = vec![];
let sample_count_per_msg =
1000usize.div_ceil(cuda_noise_squashing_compression_key.lwe_per_glwe.0);
let chunk_size = 4;
let vec_local_streams = (0..chunk_size)
.map(|_| CudaStreams::new_single_gpu(GpuIndex::new(gpu_index)))
.collect::<Vec<_>>();
for _i in 0..cleartext_modulus {
let current_noise_samples_after_packing: Vec<_> = (0..sample_count_per_msg)
.collect::<Vec<_>>()
.chunks(chunk_size)
.flat_map(|chunk| {
chunk
.into_par_iter()
.map(|i| {
let local_stream = &vec_local_streams[*i % chunk_size];
let (_before_packing, after_packing) =
encrypt_dp_ks_standard_pbs128_packing_ks_noise_helper_gpu(
atomic_params,
noise_squashing_params,
noise_squashing_compression_params,
&cks,
&cuda_sks,
&noise_squashing_private_key,
&noise_squashing_key,
&cuda_noise_squashing_key,
&noise_squashing_compression_private_key,
&cuda_noise_squashing_compression_key,
0,
max_scalar_mul,
br_input_modulus_log,
local_stream,
);
after_packing
})
.collect::<Vec<_>>()
})
.collect();
noise_samples_after_packing.extend(current_noise_samples_after_packing);
}
let noise_samples_after_packing_flattened: Vec<_> = noise_samples_after_packing
.into_iter()
.flatten()
.map(|x| x.value)
.collect();
let after_packing_is_ok = mean_and_variance_check(
&noise_samples_after_packing_flattened,
"after_packing",
0.0,
after_packing_sim.variance(),
noise_squashing_compression_params.packing_ks_key_noise_distribution,
after_packing_sim.lwe_dimension(),
after_packing_sim.modulus().as_f64(),
);
assert!(after_packing_is_ok);
}
create_gpu_parameterized_test!(
noise_check_encrypt_dp_ks_standard_pbs128_packing_ks_noise_gpu {
TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
}
);
create_gpu_parameterized_test!(sanity_check_encrypt_dp_ks_standard_pbs128_packing_ks_gpu {
TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
});
create_gpu_parameterized_test!(sanity_check_encrypt_dp_ks_standard_pbs128_gpu {
TEST_META_PARAM_CPU_2_2_KS_PBS_PKE_TO_SMALL_ZKV2_TUNIFORM_2M128,
});

View File

@@ -1,3 +1,5 @@
pub mod br_dp_ks_ms;
pub mod br_dp_packingks_ms;
pub mod dp_ks_ms;
pub mod dp_ks_pbs_128_packingks;
pub mod utils;

View File

@@ -1,7 +1,7 @@
use crate::core_crypto::commons::noise_formulas::noise_simulation::traits::{
AllocateCenteredBinaryShiftedStandardModSwitchResult,
AllocateDriftTechniqueStandardModSwitchResult, AllocateLweBootstrapResult,
AllocateLweKeyswitchResult, AllocateStandardModSwitchResult,
AllocateLweKeyswitchResult, AllocateLwePackingKeyswitchResult, AllocateStandardModSwitchResult,
CenteredBinaryShiftedStandardModSwitch, DriftTechniqueStandardModSwitch,
LweClassicFftBootstrap, LweKeyswitch, ScalarMul, StandardModSwitch,
};
@@ -13,6 +13,7 @@ use crate::core_crypto::gpu::cuda_modulus_switch_ciphertext;
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_bootstrap_key::CudaModulusSwitchNoiseReductionConfiguration;
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey;
use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::prelude::*;
use crate::integer::gpu::ciphertext::info::CudaBlockInfo;
@@ -25,7 +26,7 @@ use crate::integer::gpu::{
cuda_centered_modulus_switch_64, unchecked_small_scalar_mul_integer_async, CudaStreams,
};
use crate::shortint::server_key::tests::noise_distribution::utils::noise_simulation::NoiseSimulationModulusSwitchConfig;
use crate::shortint::server_key::tests::noise_distribution::utils::traits::LwePackingKeyswitch;
/// Side resources for CUDA operations in noise simulation
#[derive(Clone)]
pub struct CudaSideResources {
@@ -128,6 +129,19 @@ impl CudaDynLwe {
}
}
pub fn as_ct_128_cpu(&self, streams: &CudaStreams) -> LweCiphertext<Vec<u128>> {
match self {
Self::U32(_) => panic!("Tried getting a u32 CudaLweCiphertextList as u128."),
Self::U64(_) => panic!("Tried getting a u64 CudaLweCiphertextList as u128."),
Self::U128(_cuda_lwe) => {
let cpu_lwe_list = self.as_lwe_128().to_lwe_ciphertext_list(streams);
LweCiphertext::from_container(
cpu_lwe_list.clone().into_container(),
cpu_lwe_list.ciphertext_modulus(),
)
}
}
}
pub fn from_lwe_32(cuda_lwe: CudaLweCiphertextList<u32>) -> Self {
Self::U32(cuda_lwe)
}
@@ -141,6 +155,19 @@ impl CudaDynLwe {
}
}
/// Converts a CudaGlweCiphertextList<u64> to a GlweCiphertext<Vec<u64>>
pub fn cuda_glwe_list_to_glwe_ciphertext(
cuda_glwe_list: &CudaGlweCiphertextList<u64>,
streams: &CudaStreams,
) -> GlweCiphertext<Vec<u64>> {
let cpu_glwe_list = cuda_glwe_list.to_glwe_ciphertext_list(streams);
GlweCiphertext::from_container(
cpu_glwe_list.clone().into_container(),
cpu_glwe_list.polynomial_size(),
cpu_glwe_list.ciphertext_modulus(),
)
}
impl ScalarMul<u64> for CudaDynLwe {
type Output = Self;
type SideResources = CudaSideResources;
@@ -313,13 +340,14 @@ impl StandardModSwitch<Self> for CudaDynLwe {
panic!("U32 modulus switch not implemented for CudaDynLwe - only U64 is supported");
}
(Self::U64(input), Self::U64(output_cuda_lwe)) => {
let internal_output = input.duplicate(&side_resources.streams);
let mut internal_output = input.duplicate(&side_resources.streams);
cuda_modulus_switch_ciphertext(
&mut output_cuda_lwe.0.d_vec,
&mut internal_output.0.d_vec,
output_modulus_log.0 as u32,
&side_resources.streams,
);
let mut cpu_lwe = internal_output.to_lwe_ciphertext_list(&side_resources.streams);
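// The switched values live in the low output_modulus_log bits; shift them back to the most
// significant bits so they are expressed under the native 64-bit modulus again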
let shift_to_map_to_native = u64::BITS - output_modulus_log.0 as u32;
for val in cpu_lwe.as_mut_view().into_container().iter_mut() {
*val <<= shift_to_map_to_native;
@@ -713,3 +741,193 @@ impl AllocateLweBootstrapResult for CudaGlweCiphertextList<u128> {
CudaDynLwe::U128(cuda_lwe)
}
}
// Implement LweClassicFft128Bootstrap for CudaNoiseSquashingKey using the 128-bit PBS CUDA function
impl
crate::core_crypto::commons::noise_formulas::noise_simulation::traits::LweClassicFft128Bootstrap<
CudaDynLwe,
CudaDynLwe,
CudaGlweCiphertextList<u128>,
> for crate::integer::gpu::noise_squashing::keys::CudaNoiseSquashingKey
{
type SideResources = CudaSideResources;
fn lwe_classic_fft_128_pbs(
&self,
input: &CudaDynLwe,
output: &mut CudaDynLwe,
accumulator: &CudaGlweCiphertextList<u128>,
side_resources: &mut Self::SideResources,
) {
use crate::core_crypto::gpu::algorithms::lwe_programmable_bootstrapping::cuda_programmable_bootstrap_128_lwe_ciphertext_async;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
match (input, output) {
(CudaDynLwe::U64(input_cuda_lwe), CudaDynLwe::U128(output_cuda_lwe)) => {
// Get the bootstrapping key from self; it is already of the u128 type
let bsk = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => d_bsk,
CudaBootstrappingKey::MultiBit(_) => {
panic!("MultiBit bootstrapping keys are not supported for 128-bit PBS");
}
};
unsafe {
cuda_programmable_bootstrap_128_lwe_ciphertext_async(
input_cuda_lwe,
output_cuda_lwe,
accumulator,
bsk,
&side_resources.streams,
);
side_resources.streams.synchronize();
}
}
_ => panic!("128-bit PBS expects U64 input and U128 output for CudaDynLwe"),
}
}
}
impl AllocateLwePackingKeyswitchResult for CudaLwePackingKeyswitchKey<u64> {
type Output = CudaGlweCiphertextList<u64>;
type SideResources = CudaSideResources;
fn allocate_lwe_packing_keyswitch_result(
&self,
side_resources: &mut Self::SideResources,
) -> Self::Output {
let glwe_dimension = self.output_glwe_size().to_glwe_dimension();
let polynomial_size = self.output_polynomial_size();
let ciphertext_modulus = self.ciphertext_modulus();
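// The packing keyswitch writes all input LWEs into a single output GLWE, hence a list of one
// ciphertext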
CudaGlweCiphertextList::new(
glwe_dimension,
polynomial_size,
GlweCiphertextCount(1),
ciphertext_modulus,
&side_resources.streams,
)
}
}
impl LwePackingKeyswitch<[&CudaDynLwe], CudaGlweCiphertextList<u64>>
for CudaLwePackingKeyswitchKey<u64>
{
type SideResources = CudaSideResources;
fn keyswitch_lwes_and_pack_in_glwe(
&self,
input: &[&CudaDynLwe],
output: &mut CudaGlweCiphertextList<u64>,
side_resources: &mut CudaSideResources,
) {
use crate::core_crypto::gpu::algorithms::lwe_packing_keyswitch::cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64;
let input_lwe_ciphertext_list = CudaLweCiphertextList::from_vec_cuda_lwe_ciphertexts_list(
input.iter().map(|ciphertext| ciphertext.as_lwe_64()),
&side_resources.streams,
);
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_64(
self,
&input_lwe_ciphertext_list,
output,
&side_resources.streams,
);
}
}
// Implement StandardModSwitch traits for CudaGlweCiphertextList<u64>
impl AllocateStandardModSwitchResult for CudaGlweCiphertextList<u64> {
type Output = Self;
type SideResources = CudaSideResources;
fn allocate_standard_mod_switch_result(
&self,
side_resources: &mut Self::SideResources,
) -> Self::Output {
Self::new(
self.glwe_dimension(),
self.polynomial_size(),
self.glwe_ciphertext_count(),
self.ciphertext_modulus(),
&side_resources.streams,
)
}
}
impl StandardModSwitch<Self> for CudaGlweCiphertextList<u64> {
type SideResources = CudaSideResources;
fn standard_mod_switch(
&self,
storage_log_modulus: CiphertextModulusLog,
output: &mut Self,
side_resources: &mut CudaSideResources,
) {
let mut internal_output = self.duplicate(&side_resources.streams);
cuda_modulus_switch_ciphertext(
&mut internal_output.0.d_vec,
storage_log_modulus.0 as u32,
&side_resources.streams,
);
side_resources.streams.synchronize();
let mut cpu_glwe = internal_output.to_glwe_ciphertext_list(&side_resources.streams);
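// As for LWEs, shift the switched values back up to the most significant bits of the native
// 64-bit modulus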
let shift_to_map_to_native = u64::BITS - storage_log_modulus.0 as u32;
for val in cpu_glwe.as_mut_view().into_container().iter_mut() {
*val <<= shift_to_map_to_native;
}
let d_after_ms = Self::from_glwe_ciphertext_list(&cpu_glwe, &side_resources.streams);
*output = d_after_ms;
}
}
impl AllocateLwePackingKeyswitchResult for CudaLwePackingKeyswitchKey<u128> {
type Output = CudaGlweCiphertextList<u128>;
type SideResources = CudaSideResources;
fn allocate_lwe_packing_keyswitch_result(
&self,
side_resources: &mut Self::SideResources,
) -> Self::Output {
let glwe_dimension = self.output_glwe_size().to_glwe_dimension();
let polynomial_size = self.output_polynomial_size();
let ciphertext_modulus = self.ciphertext_modulus();
CudaGlweCiphertextList::new(
glwe_dimension,
polynomial_size,
GlweCiphertextCount(1),
ciphertext_modulus,
&side_resources.streams,
)
}
}
impl LwePackingKeyswitch<[&CudaDynLwe], CudaGlweCiphertextList<u128>>
for CudaLwePackingKeyswitchKey<u128>
{
type SideResources = CudaSideResources;
fn keyswitch_lwes_and_pack_in_glwe(
&self,
input: &[&CudaDynLwe],
output: &mut CudaGlweCiphertextList<u128>,
side_resources: &mut CudaSideResources,
) {
use crate::core_crypto::gpu::algorithms::lwe_packing_keyswitch::cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_128;
let input_lwe_ciphertext_list = CudaLweCiphertextList::from_vec_cuda_lwe_ciphertexts_list(
input.iter().map(|ciphertext| ciphertext.as_lwe_128()),
&side_resources.streams,
);
cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_128(
self,
&input_lwe_ciphertext_list,
output,
&side_resources.streams,
);
}
}

View File

@@ -2,6 +2,7 @@ use super::{RadixCiphertext, ServerKey, SignedRadixCiphertext};
use crate::core_crypto::commons::generators::DeterministicSeeder;
use crate::core_crypto::prelude::DefaultRandomGenerator;
use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
use std::num::NonZeroU64;
pub use tfhe_csprng::seeders::{Seed, Seeder};
@@ -163,6 +164,7 @@ impl ServerKey {
/// as `num_input_random_bits`
///
/// ```rust
/// use std::num::NonZeroU64;
/// use tfhe::integer::gen_keys_radix;
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128;
/// use tfhe::Seed;
@@ -173,7 +175,7 @@ impl ServerKey {
/// let (cks, sks) = gen_keys_radix(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128, size);
///
/// let num_input_random_bits = 5;
/// let excluded_upper_bound = 3;
/// let excluded_upper_bound = NonZeroU64::new(3).unwrap();
/// let num_blocks_output = 8;
///
/// let ct_res = sks.par_generate_oblivious_pseudo_random_unsigned_custom_range(
@@ -186,15 +188,17 @@ impl ServerKey {
/// // Decrypt:
/// let dec_result: u64 = cks.decrypt(&ct_res);
///
/// assert!(dec_result < excluded_upper_bound);
/// assert!(dec_result < excluded_upper_bound.get());
/// ```
pub fn par_generate_oblivious_pseudo_random_unsigned_custom_range(
&self,
seed: Seed,
num_input_random_bits: u64,
excluded_upper_bound: u64,
excluded_upper_bound: NonZeroU64,
num_blocks_output: u64,
) -> RadixCiphertext {
let excluded_upper_bound = excluded_upper_bound.get();
assert!(self.message_modulus().0.is_power_of_two());
let message_bits_count = self.message_modulus().0.ilog2() as u64;
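The custom-range generation ultimately realizes the same multiply-shift mapping that the tests model further down in `oprf_density_function`. A cleartext model of that mapping (hedged: this stands in for the encrypted code path, it is not the server-key implementation):

use std::num::NonZeroU64;

// `random_bits` is assumed uniform over [0, 2^num_input_random_bits); the
// multiply-shift scales it into [0, excluded_upper_bound).
fn cleartext_custom_range(
    random_bits: u64,
    num_input_random_bits: u64,
    excluded_upper_bound: NonZeroU64,
) -> u64 {
    debug_assert!(random_bits < (1 << num_input_random_bits));
    (random_bits * excluded_upper_bound.get()) >> num_input_random_bits
}

fn main() {
    // With 5 input bits and bound 3, every output lands in {0, 1, 2}.
    for r in 0..(1u64 << 5) {
        assert!(cleartext_custom_range(r, 5, NonZeroU64::new(3).unwrap()) < 3);
    }
}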

View File

@@ -10,6 +10,7 @@ use crate::integer::{BooleanBlock, IntegerKeyKind, RadixCiphertext, RadixClientK
use crate::shortint::parameters::*;
use crate::{ClientKey, CompressedServerKey, MatchValues, Seed, Tag};
use std::cmp::{max, min};
use std::num::NonZeroU64;
use std::sync::Arc;
create_parameterized_test!(random_op_sequence {
@@ -498,7 +499,18 @@ where
&ServerKey::par_generate_oblivious_pseudo_random_unsigned_integer_bounded,
);
let oprf_custom_range_executor = OpSequenceCpuFunctionExecutor::new(
&ServerKey::par_generate_oblivious_pseudo_random_unsigned_custom_range,
&|sk: &ServerKey,
seed: Seed,
num_input_random_bits: u64,
excluded_upper_bound: u64,
num_blocks_output: u64| {
sk.par_generate_oblivious_pseudo_random_unsigned_custom_range(
seed,
num_input_random_bits,
NonZeroU64::new(excluded_upper_bound).unwrap_or(NonZeroU64::new(1).unwrap()),
num_blocks_output,
)
},
);
let mut oprf_ops: Vec<(OprfExecutor, String)> = vec![(
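As a side note on the adapter closure above: since Rust 1.70 the zero-clamping fallback can also be spelled with `NonZeroU64::MIN`. A hedged stylistic alternative, not what this diff uses:

use std::num::NonZeroU64;

// NonZeroU64::MIN is 1, so a zero bound is clamped to 1, exactly as
// `unwrap_or(NonZeroU64::new(1).unwrap())` does above.
fn clamp_bound(excluded_upper_bound: u64) -> NonZeroU64 {
    NonZeroU64::new(excluded_upper_bound).unwrap_or(NonZeroU64::MIN)
}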

View File

@@ -9,6 +9,7 @@ use crate::integer::{IntegerKeyKind, RadixCiphertext, RadixClientKey, ServerKey}
use crate::shortint::parameters::*;
use statrs::distribution::ContinuousCDF;
use std::collections::HashMap;
use std::num::NonZeroU64;
use std::sync::Arc;
use tfhe_csprng::seeders::Seed;
@@ -36,9 +37,19 @@ fn oprf_any_range_unsigned<P>(param: P)
where
P: Into<TestParameters>,
{
let executor = CpuFunctionExecutor::new(
&ServerKey::par_generate_oblivious_pseudo_random_unsigned_custom_range,
);
let executor =
CpuFunctionExecutor::new(&|sk: &ServerKey,
seed: Seed,
num_input_random_bits: u64,
excluded_upper_bound: u64,
num_blocks_output: u64| {
sk.par_generate_oblivious_pseudo_random_unsigned_custom_range(
seed,
num_input_random_bits,
NonZeroU64::new(excluded_upper_bound).unwrap(),
num_blocks_output,
)
});
oprf_any_range_test(param, executor);
}
@@ -46,9 +57,19 @@ fn oprf_almost_uniformity_unsigned<P>(param: P)
where
P: Into<TestParameters>,
{
let executor = CpuFunctionExecutor::new(
&ServerKey::par_generate_oblivious_pseudo_random_unsigned_custom_range,
);
let executor =
CpuFunctionExecutor::new(&|sk: &ServerKey,
seed: Seed,
num_input_random_bits: u64,
excluded_upper_bound: u64,
num_blocks_output: u64| {
sk.par_generate_oblivious_pseudo_random_unsigned_custom_range(
seed,
num_input_random_bits,
NonZeroU64::new(excluded_upper_bound).unwrap(),
num_blocks_output,
)
});
oprf_almost_uniformity_test(param, executor);
}
@@ -89,7 +110,7 @@ where
);
}
pub fn oprf_uniformity_test<P, E>(param: P, mut executor: E)
pub(crate) fn oprf_uniformity_test<P, E>(param: P, mut executor: E)
where
P: Into<TestParameters>,
E: for<'a> FunctionExecutor<(Seed, u64, u64), RadixCiphertext>,
@@ -113,7 +134,7 @@ where
});
}
pub fn oprf_any_range_test<P, E>(param: P, mut executor: E)
pub(crate) fn oprf_any_range_test<P, E>(param: P, mut executor: E)
where
P: Into<TestParameters>,
E: for<'a> FunctionExecutor<(Seed, u64, u64, u64), RadixCiphertext>,
@@ -149,7 +170,7 @@ where
}
}
pub fn oprf_almost_uniformity_test<P, E>(param: P, mut executor: E)
pub(crate) fn oprf_almost_uniformity_test<P, E>(param: P, mut executor: E)
where
P: Into<TestParameters>,
E: for<'a> FunctionExecutor<(Seed, u64, u64, u64), RadixCiphertext>,
@@ -165,40 +186,70 @@ where
let num_input_random_bits: u64 = 4;
let num_blocks_output = 64;
let excluded_upper_bound = 10;
let random_input_upper_bound = 1 << num_input_random_bits;
let mut density = vec![0_usize; excluded_upper_bound as usize];
for i in 0..random_input_upper_bound {
let index = ((i * excluded_upper_bound) as f64 / random_input_upper_bound as f64) as usize;
density[index] += 1;
}
let theoretical_pdf: Vec<f64> = density
.iter()
.map(|count| *count as f64 / random_input_upper_bound as f64)
.collect();
let values: Vec<u64> = (0..sample_count)
.map(|seed| {
let img = executor.execute((
Seed(seed as u128),
num_input_random_bits,
excluded_upper_bound as u64,
excluded_upper_bound,
num_blocks_output,
));
cks.decrypt(&img)
})
.collect();
let p_value_upper_bound = p_value_upper_bound_oprf_almost_uniformity_from_values(
&values,
num_input_random_bits,
excluded_upper_bound,
);
assert!(p_value_limit < p_value_upper_bound);
}
pub(crate) fn p_value_upper_bound_oprf_almost_uniformity_from_values(
values: &[u64],
num_input_random_bits: u64,
excluded_upper_bound: u64,
) -> f64 {
let density = oprf_density_function(excluded_upper_bound, num_input_random_bits);
let theoretical_pdf = probability_density_function_from_density(&density);
let mut bins = vec![0_u64; excluded_upper_bound as usize];
for value in values {
for value in values.iter().copied() {
bins[value as usize] += 1;
}
let cumulative_bins = cumulate(&bins);
let theoretical_cdf = cumulate(&theoretical_pdf);
let sup_diff = sup_diff(&cumulative_bins, &theoretical_cdf);
let p_value_upper_bound = dkw_alpha_from_epsilon(sample_count as f64, sup_diff);
assert!(p_value_limit < p_value_upper_bound);
dkw_alpha_from_epsilon(values.len() as f64, sup_diff)
}
pub(crate) fn oprf_density_function(
excluded_upper_bound: u64,
num_input_random_bits: u64,
) -> Vec<usize> {
let random_input_upper_bound = 1 << num_input_random_bits;
let mut density = vec![0_usize; excluded_upper_bound as usize];
for i in 0..random_input_upper_bound {
let output = ((i * excluded_upper_bound) >> num_input_random_bits) as usize;
density[output] += 1;
}
density
}
pub(crate) fn probability_density_function_from_density(density: &[usize]) -> Vec<f64> {
let total_count: usize = density.iter().copied().sum();
density
.iter()
.map(|count| *count as f64 / total_count as f64)
.collect()
}
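The helpers above feed a Dvoretzky-Kiefer-Wolfowitz bound. Since `cumulate`, `sup_diff` and `dkw_alpha_from_epsilon` are outside this diff, the self-contained sketch below reconstructs the whole pipeline under the assumption that they behave as their names suggest: CDF accumulation, sup-norm gap, and alpha = 2 * exp(-2 * n * eps^2). Only `density` mirrors code visible in the hunk.

fn density(excluded_upper_bound: u64, num_input_random_bits: u64) -> Vec<usize> {
    let random_input_upper_bound = 1u64 << num_input_random_bits;
    let mut density = vec![0usize; excluded_upper_bound as usize];
    for i in 0..random_input_upper_bound {
        density[((i * excluded_upper_bound) >> num_input_random_bits) as usize] += 1;
    }
    density
}

// Normalize counts and accumulate them into a CDF.
fn cdf_from_counts(counts: &[f64]) -> Vec<f64> {
    let total: f64 = counts.iter().sum();
    counts
        .iter()
        .scan(0.0, |acc, c| {
            *acc += c / total;
            Some(*acc)
        })
        .collect()
}

// DKW inequality: P(sup |F_n - F| > eps) <= 2 * exp(-2 * n * eps^2).
fn dkw_alpha(n: f64, eps: f64) -> f64 {
    2.0 * (-2.0 * n * eps * eps).exp()
}

fn main() {
    let (bound, bits) = (10u64, 4u64);
    let theoretical: Vec<f64> = density(bound, bits).iter().map(|&c| c as f64).collect();
    // Pretend the empirical histogram matched the theory exactly: the sup gap
    // is 0 and the DKW bound degenerates to the trivial alpha = 2.
    let sup_gap = cdf_from_counts(&theoretical)
        .iter()
        .zip(cdf_from_counts(&theoretical).iter())
        .map(|(a, b)| (a - b).abs())
        .fold(0.0_f64, f64::max);
    println!("sup gap = {sup_gap}, alpha <= {}", dkw_alpha(10_000.0, sup_gap));
}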

View File

@@ -475,8 +475,12 @@ pub(crate) mod test {
}
}
pub fn test_uniformity<F>(sample_count: usize, p_value_limit: f64, distinct_values: u64, f: F)
where
pub(crate) fn test_uniformity<F>(
sample_count: usize,
p_value_limit: f64,
distinct_values: u64,
f: F,
) where
F: Sync + Fn(usize) -> u64,
{
let p_value = uniformity_p_value(f, sample_count, distinct_values);
@@ -487,7 +491,7 @@ pub(crate) mod test {
);
}
fn uniformity_p_value<F>(f: F, sample_count: usize, distinct_values: u64) -> f64
pub(crate) fn uniformity_p_value<F>(f: F, sample_count: usize, distinct_values: u64) -> f64
where
F: Sync + Fn(usize) -> u64,
{
@@ -495,8 +499,11 @@ pub(crate) mod test {
let mut values_count = HashMap::new();
for i in &values {
assert!(*i < distinct_values, "i {} dv{}", *i, distinct_values);
for i in values.iter().copied() {
assert!(
i < distinct_values,
"i (={i}) is supposed to be smaller than distinct_values (={distinct_values})",
);
*values_count.entry(i).or_insert(0) += 1;
}
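The body of `uniformity_p_value` is mostly outside this diff window. Assuming it performs a Pearson chi-squared test against the uniform law (a plausible reading given the counting loop above, but an assumption), a standalone sketch using `statrs`:

use statrs::distribution::{ChiSquared, ContinuousCDF};
use std::collections::HashMap;

// Hedged reconstruction, not the crate's helper: Pearson chi-squared test of
// `values` against the uniform distribution over [0, distinct_values).
fn chi_squared_uniformity_p_value(values: &[u64], distinct_values: u64) -> f64 {
    let mut values_count = HashMap::new();
    for &v in values {
        assert!(v < distinct_values);
        *values_count.entry(v).or_insert(0u64) += 1;
    }
    let expected = values.len() as f64 / distinct_values as f64;
    // Pearson statistic: sum over bins of (observed - expected)^2 / expected.
    let stat: f64 = (0..distinct_values)
        .map(|v| {
            let observed = *values_count.get(&v).unwrap_or(&0) as f64;
            (observed - expected).powi(2) / expected
        })
        .sum();
    // Under uniformity the statistic is approximately chi-squared with
    // (distinct_values - 1) degrees of freedom; the p-value is the upper tail.
    let chi2 = ChiSquared::new((distinct_values - 1) as f64).unwrap();
    1.0 - chi2.cdf(stat)
}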

View File

@@ -27,7 +27,7 @@ use crate::shortint::server_key::ServerKey;
use rayon::prelude::*;
#[allow(clippy::too_many_arguments)]
fn dp_ks_any_ms_standard_pbs128<
pub fn dp_ks_any_ms_standard_pbs128<
InputCt,
ScalarMulResult,
KsResult,
@@ -111,7 +111,7 @@ where
#[allow(clippy::too_many_arguments)]
#[allow(clippy::type_complexity)]
fn dp_ks_any_ms_standard_pbs128_packing_ks<
pub fn dp_ks_any_ms_standard_pbs128_packing_ks<
InputCt,
ScalarMulResult,
KsResult,

View File

@@ -727,8 +727,15 @@ async function compactPublicKeyZeroKnowledgeBench() {
serialized_size = list.safe_serialize(BigInt(10000000)).length;
}
const mean = timing / bench_loops;
let base_bench_str = "compact_fhe_uint_proven_encryption_";
let supportsThreads = await threads();
if (!supportsThreads) {
base_bench_str += "unsafe_coop_";
}
const common_bench_str =
"compact_fhe_uint_proven_encryption_" +
base_bench_str +
params.zk_scheme +
"_" +
bits_to_encrypt +