chore(gpu): stop using optional arguments altogether

chore(gpu): remove device synchronization in drop for CudaVec
2026-01-11 07:38:08 -05:00 · 2024-09-02 15:51:45 +02:00 · 2024-09-02 15:09:08 +02:00
13 changed files with 79 additions and 50 deletions
--- a/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/crypto/gadget.cuh
@@ -27,7 +27,7 @@ private:

 public:
  __device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
-                          uint32_t num_poly = 1)
+                          uint32_t num_poly)
      : base_log(base_log), level_count(level_count), num_poly(num_poly),
        state(state) {

--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -271,7 +271,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
    if (!ch_amount)
      ch_amount++;
    dim3 add_grid(ch_amount, num_blocks, 1);
-    size_t sm_size = big_lwe_size * sizeof(Torus);

    cudaSetDevice(gpu_indexes[0]);
    tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -207,9 +207,9 @@ __global__ void device_programmable_bootstrap_amortized(
  // the resulting constant coefficient of the accumulator
  // For the mask it's more complicated
  sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
-                                     glwe_dimension);
+                                     glwe_dimension, 0);
  sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
-                                     glwe_dimension);
+                                     glwe_dimension, 0);
 }

 template <typename Torus>
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -98,8 +98,8 @@ __global__ void device_programmable_bootstrap_cg(

  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                        params::degree / params::opt>(
-      accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
-      false);
+      accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat, false,
+      1);

  for (int i = 0; i < lwe_dimension; i++) {
    synchronize_threads_in_block();
@@ -111,13 +111,13 @@ __global__ void device_programmable_bootstrap_cg(
    // Perform ACC * (X^ä - 1)
    multiply_by_monomial_negacyclic_and_sub_polynomial<
        Torus, params::opt, params::degree / params::opt>(
-        accumulator, accumulator_rotated, a_hat);
+        accumulator, accumulator_rotated, a_hat, 1);

    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
-        accumulator_rotated, base_log, level_count);
+        accumulator_rotated, base_log, level_count, 1);

    synchronize_threads_in_block();

@@ -125,7 +125,7 @@ __global__ void device_programmable_bootstrap_cg(
    // decomposition, for the mask and the body (so block 0 will have the
    // accumulator decomposed at level 0, 1 at 1, etc.)
    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
-                                           accumulator_rotated);
+                                           accumulator_rotated, 1);
    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

    // We are using the same memory space for accumulator_fft and
@@ -150,9 +150,9 @@ __global__ void device_programmable_bootstrap_cg(
    // Perform a sample extract. At this point, all blocks have the result, but
    // we do the computation at block 0 to avoid waiting for extra blocks, in
    // case they're not synchronized
-    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1, 0);
  } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0, 0);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -86,7 +86,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
-        false);
+        false, 1);
  } else {
    // Load the accumulator calculated in previous iterations
    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
@@ -98,12 +98,13 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
-        accumulator, base_log, level_count);
+        accumulator, base_log, level_count, 1);

    // Decompose the accumulator. Each block gets one level of the
    // decomposition, for the mask and the body (so block 0 will have the
    // accumulator decomposed at level 0, 1 at 1, etc.)
-    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
+    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator,
+                                           1);
    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

    // We are using the same memory space for accumulator_fft and
@@ -129,9 +130,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      // Perform a sample extract. At this point, all blocks have the result,
      // but we do the computation at block 0 to avoid waiting for extra blocks,
      // in case they're not synchronized
-      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
+                                         0);
    } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
+                                         0);
    }
  } else {
    // Load the accumulator calculated in previous iterations
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -82,7 +82,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
-        false);
+        false, 1);

    // Persist
    int tid = threadIdx.x;
@@ -102,20 +102,20 @@ __global__ void __launch_bounds__(params::degree / params::opt)
  // Perform ACC * (X^ä - 1)
  multiply_by_monomial_negacyclic_and_sub_polynomial<
      Torus, params::opt, params::degree / params::opt>(global_slice,
-                                                        accumulator, a_hat);
+                                                        accumulator, a_hat, 1);

  // Perform a rounding to increase the accuracy of the
  // bootstrapped ciphertext
  round_to_closest_multiple_inplace<Torus, params::opt,
                                    params::degree / params::opt>(
-      accumulator, base_log, level_count);
+      accumulator, base_log, level_count, 1);

  synchronize_threads_in_block();

  // Decompose the accumulator. Each block gets one level of the
  // decomposition, for the mask and the body (so block 0 will have the
  // accumulator decomposed at level 0, 1 at 1, etc.)
-  GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
+  GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator, 1);
  gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

  // We are using the same memory space for accumulator_fft and
@@ -215,9 +215,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      // Perform a sample extract. At this point, all blocks have the result,
      // but we do the computation at block 0 to avoid waiting for extra blocks,
      // in case they're not synchronized
-      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
+                                         0);
    } else if (blockIdx.y == glwe_dimension) {
-      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
+                                         0);
    }
  } else {
    // Persist the updated accumulator
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -210,7 +210,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
-        false);
+        false, 1);

    // Persist
    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
@@ -225,12 +225,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
  // bootstrapped ciphertext
  round_to_closest_multiple_inplace<Torus, params::opt,
                                    params::degree / params::opt>(
-      accumulator, base_log, level_count);
+      accumulator, base_log, level_count, 1);

  // Decompose the accumulator. Each block gets one level of the
  // decomposition, for the mask and the body (so block 0 will have the
  // accumulator decomposed at level 0, 1 at 1, etc.)
-  GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
+  GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator, 1);
  gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

  // We are using the same memory space for accumulator_fft and
@@ -324,9 +324,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      // Perform a sample extract. At this point, all blocks have the result,
      // but we do the computation at block 0 to avoid waiting for extra blocks,
      // in case they're not synchronized
-      sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice);
+      sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice, 1,
+                                         0);
    } else if (blockIdx.y == glwe_dimension) {
-      sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
+      sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0,
+                                         0);
    }
  }
 }
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -115,13 +115,13 @@ __global__ void device_programmable_bootstrap_tbc(
    // Perform ACC * (X^ä - 1)
    multiply_by_monomial_negacyclic_and_sub_polynomial<
        Torus, params::opt, params::degree / params::opt>(
-        accumulator, accumulator_rotated, a_hat);
+        accumulator, accumulator_rotated, a_hat, 1);

    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
-        accumulator_rotated, base_log, level_count);
+        accumulator_rotated, base_log, level_count, 1);

    synchronize_threads_in_block();

@@ -154,9 +154,9 @@ __global__ void device_programmable_bootstrap_tbc(
    // Perform a sample extract. At this point, all blocks have the result, but
    // we do the computation at block 0 to avoid waiting for extra blocks, in
    // case they're not synchronized
-    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1, 0);
  } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0, 0);
  }
 }

--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -94,7 +94,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
-        false);
+        false, 1);
  } else {
    // Load the accumulator calculated in previous iterations
    copy_polynomial<Torus, params::opt, params::degree / params::opt>(
@@ -106,12 +106,13 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
-        accumulator, base_log, level_count);
+        accumulator, base_log, level_count, 1);

    // Decompose the accumulator. Each block gets one level of the
    // decomposition, for the mask and the body (so block 0 will have the
    // accumulator decomposed at level 0, 1 at 1, etc.)
-    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
+    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator,
+                                           1);
    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

    // We are using the same memory space for accumulator_fft and
@@ -137,9 +138,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
      // Perform a sample extract. At this point, all blocks have the result,
      // but we do the computation at block 0 to avoid waiting for extra blocks,
      // in case they're not synchronized
-      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
+      sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
+                                         0);
    } else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
-      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
+      sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
+                                         0);
    }
  } else {
    // Load the accumulator calculated in previous iterations
--- a/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/functions.cuh
@@ -45,7 +45,7 @@ template <typename T, int elems_per_thread, int block_size>
 __device__ void
 divide_by_monomial_negacyclic_inplace(T *accumulator,
                                      const T *__restrict__ input, uint32_t j,
-                                      bool zeroAcc, uint32_t num_poly = 1) {
+                                      bool zeroAcc, uint32_t num_poly) {
  constexpr int degree = block_size * elems_per_thread;
  for (int z = 0; z < num_poly; z++) {
    T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
@@ -94,7 +94,7 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
 */
 template <typename T, int elems_per_thread, int block_size>
 __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
-    T *acc, T *result_acc, uint32_t j, uint32_t num_poly = 1) {
+    T *acc, T *result_acc, uint32_t j, uint32_t num_poly) {
  constexpr int degree = block_size * elems_per_thread;
  for (int z = 0; z < num_poly; z++) {
    T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
@@ -133,7 +133,7 @@ __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
 template <typename T, int elems_per_thread, int block_size>
 __device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
                                                  int level_count,
-                                                  uint32_t num_poly = 1) {
+                                                  uint32_t num_poly) {
  constexpr int degree = block_size * elems_per_thread;
  for (int z = 0; z < num_poly; z++) {
    T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
@@ -192,7 +192,7 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
 // Extracts the body of the nth-LWE in a GLWE.
 template <typename Torus, class params>
 __device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
-                                    uint32_t glwe_dimension, uint32_t nth = 0) {
+                                    uint32_t glwe_dimension, uint32_t nth) {
  // Set first coefficient of the glwe as the body of the LWE sample
  lwe_array_out[glwe_dimension * params::degree] =
      glwe[glwe_dimension * params::degree + nth];
@@ -201,8 +201,7 @@ __device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
 // Extracts the mask from the nth-LWE in a GLWE.
 template <typename Torus, class params>
 __device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
-                                    uint32_t glwe_dimension = 1,
-                                    uint32_t nth = 0) {
+                                    uint32_t glwe_dimension, uint32_t nth) {
  for (int z = 0; z < glwe_dimension; z++) {
    Torus *lwe_array_out_slice =
        (Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
--- a/tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/glwe_sample_extraction.rs
@@ -5,16 +5,16 @@ use crate::core_crypto::gpu::{extract_lwe_samples_from_glwe_ciphertext_list_asyn
 use crate::core_crypto::prelude::{MonomialDegree, UnsignedTorus};
 use itertools::Itertools;

-/// For each [`GLWE Ciphertext`] (`CudaGlweCiphertextList`) given as input, extract the nth
-/// coefficient from its body as an [`LWE ciphertext`](`CudaLweCiphertextList`). This variant is
-/// GPU-accelerated.
-pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
+/// # Safety
+///
+/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
+///   be dropped until stream is synchronised
+pub unsafe fn cuda_extract_lwe_samples_from_glwe_ciphertext_list_async<Scalar>(
    input_glwe_list: &CudaGlweCiphertextList<Scalar>,
    output_lwe_list: &mut CudaLweCiphertextList<Scalar>,
    vec_nth: &[MonomialDegree],
    streams: &CudaStreams,
 ) where
-    // CastInto required for PBS modulus switch which returns a usize
    Scalar: UnsignedTorus,
 {
    let in_lwe_dim = input_glwe_list
@@ -58,3 +58,25 @@ pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
        );
    }
 }
+
+/// For each [`GLWE Ciphertext`] (`CudaGlweCiphertextList`) given as input, extract the nth
+/// coefficient from its body as an [`LWE ciphertext`](`CudaLweCiphertextList`). This variant is
+/// GPU-accelerated.
+pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
+    input_glwe_list: &CudaGlweCiphertextList<Scalar>,
+    output_lwe_list: &mut CudaLweCiphertextList<Scalar>,
+    vec_nth: &[MonomialDegree],
+    streams: &CudaStreams,
+) where
+    Scalar: UnsignedTorus,
+{
+    unsafe {
+        cuda_extract_lwe_samples_from_glwe_ciphertext_list_async(
+            input_glwe_list,
+            output_lwe_list,
+            vec_nth,
+            streams,
+        );
+    }
+    streams.synchronize();
+}
--- a/tfhe/src/core_crypto/gpu/vec.rs
+++ b/tfhe/src/core_crypto/gpu/vec.rs
@@ -1,5 +1,5 @@
 use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut};
-use crate::core_crypto::gpu::{synchronize_device, CudaStreams};
+use crate::core_crypto::gpu::CudaStreams;
 use crate::core_crypto::prelude::Numeric;
 use std::collections::Bound::{Excluded, Included, Unbounded};
 use std::ffi::c_void;
@@ -447,8 +447,6 @@ impl<T: Numeric> Drop for CudaVec<T> {
    /// Free memory for pointer `ptr` synchronously
    fn drop(&mut self) {
        for &gpu_index in self.gpu_indexes.iter() {
-            // Synchronizes the device to be sure no stream is still using this pointer
-            synchronize_device(gpu_index);
            unsafe { cuda_drop(self.get_mut_c_ptr(gpu_index), gpu_index) };
        }
    }
--- a/tfhe/src/integer/gpu/server_key/radix/add.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/add.rs
@@ -370,6 +370,7 @@ impl CudaServerKey {
        let mut result = unsafe { ciphertexts[0].duplicate_async(streams) };

        if ciphertexts.len() == 1 {
+            streams.synchronize();
            return Some(result);
        }
Author	SHA1	Message	Date
Agnes Leroy	7dcbd85a83	chore(gpu): stop using optional arguments altogether	2024-09-02 15:51:45 +02:00
Agnes Leroy	1e453263af	chore(gpu): remove device synchronization in drop for CudaVec	2024-09-02 15:09:08 +02:00