fix(gpu): add indexes to modulus switch noise reduction

Author: Guillermo Oyarzun, 2025-05-13 13:33:46 +02:00
Committer: Agnès Leroy
parent 45fdba04b1
commit c19cd9f021
10 changed files with 68 additions and 71 deletions

View File

@@ -28,9 +28,10 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
 void cuda_improve_noise_modulus_switch_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
-    uint32_t num_lwes, uint32_t num_zeros, double input_variance,
-    double r_sigma, double bound, uint32_t log_modulus);
+    void const *lwe_array_in, void const *lwe_array_indexes,
+    void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
+    uint32_t num_zeros, double input_variance, double r_sigma, double bound,
+    uint32_t log_modulus);
 void cuda_glwe_sample_extract_128(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
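For reference, the updated entry point expects a device-resident table with one 64-bit index per input ciphertext, telling each block which slot of the input/output arrays it owns. A minimal sketch of a call with an identity mapping, assuming the helper signatures shown elsewhere in this commit (all variable and function names here are illustrative):

#include <cstdint>

// Sketch only: build an identity index table (ciphertext i -> slot i) and
// pass it to the re-signed entry point. d_out, d_in, d_zeros are device
// pointers prepared elsewhere.
void call_with_identity_indexes(void *stream, uint32_t gpu_index, void *d_out,
                                void const *d_in, void const *d_zeros,
                                uint32_t lwe_size, uint32_t num_lwes,
                                uint32_t num_zeros, double input_variance,
                                double r_sigma, double bound,
                                uint32_t log_modulus) {
  uint64_t size_tracker = 0;
  // One 64-bit index per input ciphertext, filled with 0..num_lwes-1.
  auto *d_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
      num_lwes * sizeof(uint64_t), stream, gpu_index, size_tracker, true);
  auto *h_indexes = new uint64_t[num_lwes];
  for (uint32_t i = 0; i < num_lwes; i++)
    h_indexes[i] = i;
  cuda_memcpy_with_size_tracking_async_to_gpu(d_indexes, h_indexes,
                                              num_lwes * sizeof(uint64_t),
                                              stream, gpu_index, true);
  cuda_synchronize_stream(stream, gpu_index); // copy still reads h_indexes
  delete[] h_indexes;

  cuda_improve_noise_modulus_switch_64(
      stream, gpu_index, d_out, d_in, d_indexes, d_zeros, lwe_size, num_lwes,
      num_zeros, input_variance, r_sigma, bound, log_modulus);
}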

View File

@@ -248,6 +248,7 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
   __uint128_t *global_accumulator;
   double *global_join_buffer;
   __uint128_t *temp_lwe_array_in;
+  uint64_t *trivial_indexes;
   PBS_VARIANT pbs_variant;
   bool uses_noise_reduction;
@@ -263,11 +264,27 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
     cuda_set_device(gpu_index);
     this->pbs_variant = pbs_variant;
     this->uses_noise_reduction = allocate_ms_array;
-    this->temp_lwe_array_in =
-        (__uint128_t *)cuda_malloc_with_size_tracking_async(
-            (lwe_dimension + 1) * input_lwe_ciphertext_count *
-                sizeof(__uint128_t),
-            stream, gpu_index, size_tracker, allocate_ms_array);
+    if (allocate_ms_array) {
+      this->temp_lwe_array_in =
+          (__uint128_t *)cuda_malloc_with_size_tracking_async(
+              (lwe_dimension + 1) * input_lwe_ciphertext_count *
+                  sizeof(__uint128_t),
+              stream, gpu_index, size_tracker, allocate_ms_array);
+      this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
+          input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
+          size_tracker, allocate_ms_array);
+      uint64_t *h_trivial_indexes = new uint64_t[input_lwe_ciphertext_count];
+      for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
+        h_trivial_indexes[i] = i;
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          trivial_indexes, h_trivial_indexes,
+          input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
+          allocate_gpu_memory);
+      cuda_synchronize_stream(stream, gpu_index);
+      delete[] h_trivial_indexes;
+    }
     auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
     size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
                                      input_lwe_ciphertext_count *
@@ -404,9 +421,12 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
     cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
                                        gpu_memory_allocated);
-    if (uses_noise_reduction)
+    if (uses_noise_reduction) {
       cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
                                          gpu_memory_allocated);
+      cuda_drop_with_size_tracking_async(trivial_indexes, stream, gpu_index,
+                                         gpu_memory_allocated);
+    }
   }
 };
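A note on the constructor above: the explicit `cuda_synchronize_stream` is there because the host-to-device copy is asynchronous and may still be reading `h_trivial_indexes` when `delete[]` would otherwise run. The same identity-table setup could be written with `std::iota`; a hedged sketch under the same assumed helper signatures:

#include <cstdint>
#include <numeric>
#include <vector>

// Sketch of an equivalent trivial-index setup. std::iota replaces the manual
// fill loop; the vector's lifetime is extended past the async copy.
void fill_trivial_indexes(uint64_t *d_indexes, uint32_t count, void *stream,
                          uint32_t gpu_index, bool gpu_memory_allocated) {
  std::vector<uint64_t> h_indexes(count);
  std::iota(h_indexes.begin(), h_indexes.end(), 0); // 0, 1, ..., count - 1
  cuda_memcpy_with_size_tracking_async_to_gpu(
      d_indexes, h_indexes.data(), count * sizeof(uint64_t), stream, gpu_index,
      gpu_memory_allocated);
  // The copy is asynchronous: keep h_indexes alive until the stream is done.
  cuda_synchronize_stream(stream, gpu_index);
}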

View File

@@ -86,13 +86,15 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
 void cuda_improve_noise_modulus_switch_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
-    uint32_t num_lwes, uint32_t num_zeros, double input_variance,
-    double r_sigma, double bound, uint32_t log_modulus) {
+    void const *lwe_array_in, void const *lwe_array_indexes,
+    void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
+    uint32_t num_zeros, double input_variance, double r_sigma, double bound,
+    uint32_t log_modulus) {
   host_improve_noise_modulus_switch<uint64_t>(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint64_t *>(lwe_array_out),
       static_cast<uint64_t const *>(lwe_array_in),
+      static_cast<uint64_t const *>(lwe_array_indexes),
       static_cast<const uint64_t *>(encrypted_zeros), lwe_size, num_lwes,
       num_zeros, input_variance, r_sigma, bound, log_modulus);
 }

View File

@@ -178,11 +178,10 @@ __device__ __forceinline__ double measure_modulus_switch_noise(
 // Each thread processes two elements of the lwe array
 template <typename Torus>
-__global__ void
-improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
-                             const Torus *zeros, int lwe_size, int num_zeros,
-                             double input_variance, double r_sigma,
-                             double bound, uint32_t log_modulus) {
+__global__ void improve_noise_modulus_switch(
+    Torus *array_out, const Torus *array_in, const uint64_t *indexes,
+    const Torus *zeros, int lwe_size, int num_zeros, double input_variance,
+    double r_sigma, double bound, uint32_t log_modulus) {
   // First we will assume size is less than the number of threads per block
   // I should switch this to dynamic shared memory
@@ -198,13 +197,13 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
   // This probably are not needed cause we are setting the values
   sum_mask_errors[threadIdx.x] = 0.f;
   sum_squared_mask_errors[threadIdx.x] = 0.f;
-  Torus input_element1 = array_in[threadIdx.x + blockIdx.x * lwe_size];
-  Torus input_element2 =
-      threadIdx.x + blockDim.x < lwe_size
-          ? array_in[threadIdx.x + blockDim.x + blockIdx.x * lwe_size]
-          : 0;
+  auto this_block_lwe_in = array_in + indexes[blockIdx.x] * lwe_size;
+  auto this_block_lwe_out = array_out + indexes[blockIdx.x] * lwe_size;
+  Torus input_element1 = this_block_lwe_in[threadIdx.x];
+  Torus input_element2 = threadIdx.x + blockDim.x < lwe_size
+                             ? this_block_lwe_in[threadIdx.x + blockDim.x]
+                             : 0;
   // Base noise is only handled by thread 0
   double base_noise = measure_modulus_switch_noise<Torus>(
@@ -218,11 +217,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
   __syncthreads();
   if (found)
-    array_out[threadIdx.x + blockIdx.x * lwe_size] = input_element1;
+    this_block_lwe_out[threadIdx.x] = input_element1;
   if (found && (threadIdx.x + blockDim.x) < lwe_size)
-    array_out[threadIdx.x + blockDim.x + blockIdx.x * lwe_size] =
-        input_element2;
+    this_block_lwe_out[threadIdx.x + blockDim.x] = input_element2;
   __syncthreads();
   // If we found a zero element we stop iterating (in avg 20 times are
@@ -253,11 +251,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
   // Assumption we always have at least 512 elements
   // If we find a useful zero encryption we replace the lwe by lwe + zero
   if (found)
-    array_out[threadIdx.x + blockIdx.x * lwe_size] = zero_element1;
+    this_block_lwe_out[threadIdx.x] = zero_element1;
   if (found && (threadIdx.x + blockDim.x) < lwe_size)
-    array_out[threadIdx.x + blockDim.x + blockIdx.x * lwe_size] =
-        zero_element2;
+    this_block_lwe_out[threadIdx.x + blockDim.x] = zero_element2;
   __syncthreads();
   // If we found a zero element we stop iterating (in avg 20 times are
@@ -270,9 +267,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
 template <typename Torus>
 __host__ void host_improve_noise_modulus_switch(
     cudaStream_t stream, uint32_t gpu_index, Torus *array_out,
-    Torus const *array_in, const Torus *zeros, uint32_t lwe_size,
-    uint32_t num_lwes, const uint32_t num_zeros, const double input_variance,
-    const double r_sigma, const double bound, uint32_t log_modulus) {
+    Torus const *array_in, uint64_t const *indexes, const Torus *zeros,
+    uint32_t lwe_size, uint32_t num_lwes, const uint32_t num_zeros,
+    const double input_variance, const double r_sigma, const double bound,
+    uint32_t log_modulus) {
   if (lwe_size < 512) {
     PANIC("The lwe_size is less than 512, this is not supported\n");
@@ -289,8 +287,8 @@ __host__ void host_improve_noise_modulus_switch(
   int num_threads = 512, num_blocks = num_lwes;
   improve_noise_modulus_switch<Torus><<<num_blocks, num_threads, 0, stream>>>(
-      array_out, array_in, zeros, lwe_size, num_zeros, input_variance, r_sigma,
-      bound, log_modulus);
+      array_out, array_in, indexes, zeros, lwe_size, num_zeros, input_variance,
+      r_sigma, bound, log_modulus);
   check_cuda_error(cudaGetLastError());
 }
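The essential change in this kernel is the indirection: a block no longer assumes that ciphertext `blockIdx.x` lives at slot `blockIdx.x`, but resolves its base offset through `indexes[blockIdx.x]`. A stripped-down sketch of that addressing pattern, with the same one-block-per-ciphertext, two-elements-per-thread layout (the kernel itself is illustrative, not part of the commit):

#include <cstdint>

// Illustrative kernel: one block per ciphertext, two elements per thread,
// with the ciphertext located through an index table rather than blockIdx.x.
template <typename Torus>
__global__ void copy_indexed_lwe(Torus *array_out, const Torus *array_in,
                                 const uint64_t *indexes, int lwe_size) {
  // Resolve this block's ciphertext slot once; all threads reuse the base.
  const Torus *in = array_in + indexes[blockIdx.x] * lwe_size;
  Torus *out = array_out + indexes[blockIdx.x] * lwe_size;

  // First element is always in range because blockDim.x <= lwe_size here.
  out[threadIdx.x] = in[threadIdx.x];
  // Second element needs a bound check when lwe_size < 2 * blockDim.x.
  if (threadIdx.x + blockDim.x < lwe_size)
    out[threadIdx.x + blockDim.x] = in[threadIdx.x + blockDim.x];
}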

View File

@@ -194,7 +194,8 @@ void execute_pbs_async(
         lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
     void *zeros = nullptr;
-    if (ms_noise_reduction_key != nullptr)
+    if (ms_noise_reduction_key != nullptr &&
+        ms_noise_reduction_key->ptr != nullptr)
       zeros = ms_noise_reduction_key->ptr[i];
     cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
         streams[i], gpu_indexes[i], current_lwe_array_out,

View File

@@ -660,13 +660,15 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
       (pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
   // If the parameters contain noise reduction key, then apply it
-  if (ms_noise_reduction_key != nullptr) {
+  if (ms_noise_reduction_key != nullptr &&
+      ms_noise_reduction_key->ptr != nullptr) {
     if (ms_noise_reduction_key->num_zeros != 0) {
       uint32_t log_modulus = log2(polynomial_size) + 1;
       host_improve_noise_modulus_switch<uint64_t>(
           static_cast<cudaStream_t>(stream), gpu_index,
           buffer->temp_lwe_array_in,
           static_cast<uint64_t const *>(lwe_array_in),
+          static_cast<uint64_t const *>(lwe_input_indexes),
           static_cast<uint64_t *>(ms_noise_reduction_ptr), lwe_dimension + 1,
           num_samples, ms_noise_reduction_key->num_zeros,
           ms_noise_reduction_key->ms_input_variance,

View File

@@ -256,6 +256,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
         static_cast<cudaStream_t>(stream), gpu_index,
         static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
         static_cast<__uint128_t const *>(lwe_array_in),
+        static_cast<uint64_t const *>(buffer->trivial_indexes),
         static_cast<const __uint128_t *>(ms_noise_reduction_ptr),
         lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
         ms_noise_reduction_key->ms_input_variance,
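Unlike the 64-bit path, which forwards the caller's `lwe_input_indexes`, this 128-bit call site appears to have no user-supplied index table, so it falls back to the identity table built in the `pbs_buffer_128` constructor. A small sketch of why that is behavior-preserving (the helper is hypothetical):

#include <cstdint>

// Sketch: with trivial_indexes[i] == i, the indexed base pointer computed by
// the kernel collapses to the old contiguous addressing, so the 128-bit path
// gains only the plumbing, not a behavior change.
inline const __uint128_t *block_base(const __uint128_t *array_in,
                                     const uint64_t *indexes, uint32_t block,
                                     uint32_t lwe_size) {
  return array_in + indexes[block] * lwe_size; // indexes[block] == block here
}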

View File

@@ -50,6 +50,7 @@ unsafe extern "C" {
         gpu_index: u32,
         lwe_array_out: *mut ffi::c_void,
         lwe_array_in: *const ffi::c_void,
+        lwe_array_indexes: *const ffi::c_void,
         encrypted_zeros: *const ffi::c_void,
         lwe_size: u32,
         num_lwes: u32,

View File

@@ -1,7 +1,7 @@
 use super::super::test::TestResources;
 use crate::core_crypto::commons::test_tools::{check_both_ratio_under, mean, variance};
 use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
-use crate::core_crypto::gpu::CudaStreams;
+use crate::core_crypto::gpu::{CudaStreams, CudaVec};
 use crate::core_crypto::prelude::*;
 use crate::core_crypto::gpu::GpuIndex;
@@ -147,6 +147,10 @@ fn check_noise_improve_modulus_switch_noise(
     let gpu_index = 0;
     let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
     let num_blocks = 1;
+    let lwe_indexes: Vec<u64> = (0..num_blocks).map(|x| x as u64).collect();
+    let mut d_input_indexes = unsafe { CudaVec::<u64>::new_async(num_blocks, &streams, 0) };
+    unsafe { d_input_indexes.copy_from_cpu_async(&lwe_indexes, &streams, 0) };
     let d_encryptions_of_zero = CudaLweCiphertextList::from_lwe_ciphertext_list(
         &encryptions_of_zero,
@@ -186,6 +190,7 @@ fn check_noise_improve_modulus_switch_noise(
         streams.gpu_indexes[0].get(),
         d_ct.0.d_vec.as_mut_c_ptr(0),
         d_ct_in.0.d_vec.as_c_ptr(0),
+        d_input_indexes.as_c_ptr(0),
         d_encryptions_of_zero.0.d_vec.as_c_ptr(0),
         lwe_dimension.to_lwe_size().0 as u32,
         d_ct.lwe_ciphertext_count().0 as u32,

View File

@@ -606,40 +606,6 @@ pub unsafe fn cuda_modulus_switch_ciphertext_async<T: UnsignedInteger>(
     );
 }
-/// # Safety
-///
-/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
-/// required
-#[allow(clippy::too_many_arguments)]
-pub unsafe fn cuda_improve_noise_modulus_switch_ciphertext_async<T: UnsignedInteger>(
-    streams: &CudaStreams,
-    lwe_array_out: &mut CudaVec<T>,
-    lwe_array_in: &CudaVec<T>,
-    encrypted_zeros: &CudaVec<T>,
-    lwe_dimension: LweDimension,
-    num_samples: u32,
-    num_zeros: u32,
-    input_variance: f64,
-    r_sigma_factor: f64,
-    bound: f64,
-    log_modulus: u32,
-) {
-    cuda_improve_noise_modulus_switch_64(
-        streams.ptr[0],
-        streams.gpu_indexes[0].get(),
-        lwe_array_out.as_mut_c_ptr(0),
-        lwe_array_in.as_c_ptr(0),
-        encrypted_zeros.as_c_ptr(0),
-        lwe_dimension.to_lwe_size().0 as u32,
-        num_samples,
-        num_zeros,
-        input_variance,
-        r_sigma_factor,
-        bound,
-        log_modulus,
-    );
-}
 /// Addition of a vector of LWE ciphertexts
 ///
 /// # Safety