chore(gpu): add missing syncs in linearalgebra functions and aes

2026-01-09 22:57:59 -05:00 · 2025-10-13 15:26:10 +02:00
parent c3ed1a7558
commit cf3f25efdd
6 changed files with 167 additions and 149 deletions
--- a/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h
@@ -79,6 +79,7 @@ template <typename Torus> struct int_aes_lut_buffers {
    this->carry_lut->release(streams);
    delete this->carry_lut;
    this->carry_lut = nullptr;
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
 };

@@ -140,6 +141,7 @@ template <typename Torus> struct int_aes_round_workspaces {
                                   allocate_gpu_memory);
    delete this->vec_tmp_bit_buffer;
    this->vec_tmp_bit_buffer = nullptr;
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
 };

@@ -206,12 +208,12 @@ template <typename Torus> struct int_aes_counter_workspaces {
    delete this->vec_trivial_b_bits_buffer;
    this->vec_trivial_b_bits_buffer = nullptr;

-    free(this->h_counter_bits_buffer);
    if (allocate_gpu_memory) {
      cuda_drop_async(this->d_counter_bits_buffer, streams.stream(0),
                      streams.gpu_index(0));
-      streams.synchronize();
    }
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
+    free(this->h_counter_bits_buffer);
  }
 };

@@ -303,6 +305,7 @@ template <typename Torus> struct int_aes_main_workspaces {
                                   allocate_gpu_memory);
    delete this->batch_processing_buffer;
    this->batch_processing_buffer = nullptr;
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
 };

@@ -366,6 +369,7 @@ template <typename Torus> struct int_aes_encrypt_buffer {
    main_workspaces->release(streams, allocate_gpu_memory);
    delete main_workspaces;
    main_workspaces = nullptr;
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
 };

@@ -434,6 +438,7 @@ template <typename Torus> struct int_key_expansion_buffer {

    this->aes_encrypt_buffer->release(streams);
    delete this->aes_encrypt_buffer;
+    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
 };

--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
@@ -53,6 +53,7 @@ __host__ void host_scalar_addition_inplace(
  for (uint i = 0; i < num_scalars; i++) {
    lwe_array->degrees[i] = lwe_array->degrees[i] + h_scalar_input[i];
  }
+  cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
 }

 template <typename Torus>
@@ -93,6 +94,7 @@ __host__ void host_add_scalar_one_inplace(CudaStreams streams,
  for (uint i = 0; i < lwe_array->num_radix_blocks; i++) {
    lwe_array->degrees[i] = lwe_array->degrees[i] + 1;
  }
+  cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
 }

 template <typename Torus>
@@ -134,5 +136,6 @@ __host__ void host_scalar_subtraction_inplace(
                                             input_lwe_ciphertext_count,
                                             lwe_dimension, delta);
  check_cuda_error(cudaGetLastError());
+  cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
 }
 #endif
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh
@@ -2,15 +2,12 @@
 #define CUDA_ADD_CUH

 #ifdef __CDT_PARSER__
-#undef __CUDA_RUNTIME_H__
-#include <cuda_runtime.h>
 #endif

 #include "device.h"
 #include "helper_multi_gpu.h"
 #include "integer/integer.h"
 #include "integer/integer_utilities.h"
-#include "linear_algebra.h"
 #include "utils/kernel_dimensions.cuh"
 #include <stdio.h>

@@ -65,6 +62,7 @@ __host__ void host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index,
  plaintext_addition<T><<<grid, thds, 0, stream>>>(
      output, lwe_input, plaintext_input, lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
+  cuda_synchronize_stream(stream, gpu_index);
 }

 template <typename T>
@@ -86,6 +84,7 @@ __host__ void host_addition_plaintext_scalar(
  plaintext_addition_scalar<T><<<grid, thds, 0, stream>>>(
      output, lwe_input, plaintext_input, lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
+  cuda_synchronize_stream(stream, gpu_index);
 }

 template <typename T>
@@ -139,6 +138,7 @@ host_addition(cudaStream_t stream, uint32_t gpu_index,
        input_1->noise_levels[i] + input_2->noise_levels[i];
    CHECK_NOISE_LEVEL(output->noise_levels[i], message_modulus, carry_modulus);
  }
+  cuda_synchronize_stream(stream, gpu_index);
 }

 template <typename T>
@@ -200,46 +200,6 @@ __host__ void host_add_the_same_block_to_all_blocks(
  }
 }

-template <typename T>
-__global__ void pack_for_overflowing_ops(T *output, T const *input_1,
-                                         T const *input_2, uint32_t num_entries,
-                                         uint32_t message_modulus) {
-
-  int tid = threadIdx.x;
-  int index = blockIdx.x * blockDim.x + tid;
-  if (index < num_entries) {
-    // Here we take advantage of the wrapping behaviour of uint
-    output[index] = input_1[index] * message_modulus + input_2[index];
-  }
-}
-
-template <typename T>
-__host__ void host_pack_for_overflowing_ops(cudaStream_t stream,
-                                            uint32_t gpu_index, T *output,
-                                            T const *input_1, T const *input_2,
-                                            uint32_t input_lwe_dimension,
-                                            uint32_t input_lwe_ciphertext_count,
-                                            uint32_t message_modulus) {
-
-  cuda_set_device(gpu_index);
-  // lwe_size includes the presence of the body
-  // whereas lwe_dimension is the number of elements in the mask
-  int lwe_size = input_lwe_dimension + 1;
-  // Create a 1-dimensional grid of threads
-  int num_blocks = 0, num_threads = 0;
-  int num_entries = lwe_size;
-  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
-  dim3 grid(num_blocks, 1, 1);
-  dim3 thds(num_threads, 1, 1);
-
-  pack_for_overflowing_ops<T><<<grid, thds, 0, stream>>>(
-      &output[(input_lwe_ciphertext_count - 1) * lwe_size],
-      &input_1[(input_lwe_ciphertext_count - 1) * lwe_size],
-      &input_2[(input_lwe_ciphertext_count - 1) * lwe_size], lwe_size,
-      message_modulus);
-  check_cuda_error(cudaGetLastError());
-}
-
 template <typename T>
 __global__ void subtraction(T *output, T const *input_1, T const *input_2,
                            uint32_t num_entries) {
@@ -273,6 +233,7 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
  subtraction<T>
      <<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
  check_cuda_error(cudaGetLastError());
+  cuda_synchronize_stream(stream, gpu_index);
 }

 template <typename T>
@@ -312,6 +273,7 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
  radix_body_subtraction_inplace<T><<<grid, thds, 0, stream>>>(
      output, plaintext_input, input_lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
+  cuda_synchronize_stream(stream, gpu_index);
 }

 template <typename T>
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh
@@ -50,6 +50,7 @@ __host__ void host_cleartext_vec_multiplication(
  cleartext_vec_multiplication<T><<<grid, thds, 0, stream>>>(
      output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
  check_cuda_error(cudaGetLastError());
+  cuda_synchronize_stream(stream, gpu_index);
 }

 template <typename T>
--- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh
@@ -39,6 +39,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,

  negation<T><<<grid, thds, 0, stream>>>(output, input, num_entries);
  check_cuda_error(cudaGetLastError());
+  cuda_synchronize_stream(stream, gpu_index);
 }

 #endif // CUDA_NEGATE_H
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -304,8 +304,9 @@ where
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_scalar_addition_assign<T: UnsignedInteger>(
    streams: &CudaStreams,
    lwe_array: &mut CudaRadixCiphertext,
@@ -356,8 +357,9 @@ pub(crate) unsafe fn cuda_backend_scalar_addition_assign<T: UnsignedInteger>(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_scalar_mul<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    lwe_array: &mut CudaRadixCiphertext,
@@ -716,8 +718,9 @@ where
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_compress<
    InputTorus: UnsignedInteger,
    OutputTorus: UnsignedInteger,
@@ -869,8 +872,9 @@ pub(crate) fn cuda_backend_get_compression_size_on_gpu(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_decompress<B: Numeric>(
    streams: &CudaStreams,
    lwe_array_out: &mut CudaLweCiphertextList<u64>,
@@ -950,8 +954,9 @@ pub(crate) unsafe fn cuda_backend_decompress<B: Numeric>(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 ///
 ///  128-bit decompression doesn't execute a PBS as the 64-bit does.
 ///  We have a different entry point because we don't need to carry a bsk to the backend.
@@ -1059,8 +1064,9 @@ pub(crate) fn cuda_backend_get_decompression_size_on_gpu(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_add_assign(
    streams: &CudaStreams,
    radix_lwe_left: &mut CudaRadixCiphertext,
@@ -1133,8 +1139,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_add_assign(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_mul_assign<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    radix_lwe_left: &mut CudaRadixCiphertext,
@@ -1318,8 +1325,9 @@ pub(crate) fn cuda_backend_get_mul_size_on_gpu(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_bitop_assign<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    radix_lwe_left: &mut CudaRadixCiphertext,
@@ -1499,8 +1507,9 @@ pub(crate) fn cuda_backend_get_bitop_size_on_gpu(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_scalar_bitop_assign<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    radix_lwe: &mut CudaRadixCiphertext,
@@ -1655,8 +1664,9 @@ pub(crate) fn cuda_backend_get_scalar_bitop_size_on_gpu(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_comparison<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    radix_lwe_out: &mut CudaRadixCiphertext,
@@ -1863,8 +1873,9 @@ pub(crate) fn cuda_backend_get_comparison_size_on_gpu(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_scalar_comparison<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    radix_lwe_out: &mut CudaRadixCiphertext,
@@ -2003,8 +2014,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_scalar_comparison<T: UnsignedInteger
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_full_propagate_assign<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    radix_lwe_input: &mut CudaRadixCiphertext,
@@ -2141,8 +2153,9 @@ pub(crate) fn cuda_backend_get_full_propagate_assign_size_on_gpu(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_propagate_single_carry_assign<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    radix_lwe_input: &mut CudaRadixCiphertext,
@@ -2369,8 +2382,9 @@ pub(crate) fn cuda_backend_get_add_and_propagate_single_carry_assign_size_on_gpu
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_sub_and_propagate_single_carry_assign<
    T: UnsignedInteger,
    B: Numeric,
@@ -2539,8 +2553,9 @@ pub(crate) unsafe fn cuda_backend_sub_and_propagate_single_carry_assign<
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_add_and_propagate_single_carry_assign<
    T: UnsignedInteger,
    B: Numeric,
@@ -2700,8 +2715,9 @@ pub(crate) unsafe fn cuda_backend_add_and_propagate_single_carry_assign<
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_grouped_oprf<B: Numeric>(
    streams: &CudaStreams,
    radix_lwe_out: &mut CudaRadixCiphertext,
@@ -2838,8 +2854,9 @@ pub(crate) fn cuda_backend_get_grouped_oprf_size_on_gpu(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_unsigned_scalar_div_rem<
    T: UnsignedInteger,
    B: Numeric,
@@ -3031,8 +3048,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_unsigned_scalar_div_rem<
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_signed_scalar_div_rem_assign<
    T: UnsignedInteger,
    B: Numeric,
@@ -3422,8 +3440,9 @@ where
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_unsigned_scalar_div_assign<
    T: UnsignedInteger,
    B: Numeric,
@@ -3595,8 +3614,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_unsigned_scalar_div_assign<
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_signed_scalar_div_assign<
    T: UnsignedInteger,
    B: Numeric,
@@ -3736,8 +3756,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_signed_scalar_div_assign<
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_scalar_left_shift_assign<
    T: UnsignedInteger,
    B: Numeric,
@@ -3831,8 +3852,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_scalar_left_shift_assign<
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_scalar_logical_right_shift_assign<
    T: UnsignedInteger,
    B: Numeric,
@@ -3926,8 +3948,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_scalar_logical_right_shift_assign<
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_scalar_arithmetic_right_shift_assign<
    T: UnsignedInteger,
    B: Numeric,
@@ -4020,8 +4043,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_scalar_arithmetic_right_shift_assign
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_right_shift_assign<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    radix_input: &mut CudaRadixCiphertext,
@@ -4137,8 +4161,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_right_shift_assign<T: UnsignedIntege
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_left_shift_assign<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    radix_input: &mut CudaRadixCiphertext,
@@ -4254,8 +4279,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_left_shift_assign<T: UnsignedInteger
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_rotate_right_assign<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    radix_input: &mut CudaRadixCiphertext,
@@ -4376,8 +4402,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_rotate_right_assign<T: UnsignedInteg
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_rotate_left_assign<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    radix_input: &mut CudaRadixCiphertext,
@@ -4849,8 +4876,9 @@ pub(crate) fn cuda_backend_get_rotate_left_size_on_gpu(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_cmux<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    radix_lwe_out: &mut CudaRadixCiphertext,
@@ -5088,8 +5116,9 @@ pub(crate) fn cuda_backend_get_cmux_size_on_gpu(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_scalar_rotate_left_assign<
    T: UnsignedInteger,
    B: Numeric,
@@ -5185,8 +5214,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_scalar_rotate_left_assign<
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_scalar_rotate_right_assign<
    T: UnsignedInteger,
    B: Numeric,
@@ -5380,8 +5410,9 @@ pub(crate) fn get_scalar_rotate_right_size_on_gpu(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_partial_sum_ciphertexts_assign<
    T: UnsignedInteger,
    B: Numeric,
@@ -5489,8 +5520,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_partial_sum_ciphertexts_assign<
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_extend_radix_with_sign_msb<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    output: &mut CudaRadixCiphertext,
@@ -5564,8 +5596,9 @@ pub(crate) unsafe fn cuda_backend_extend_radix_with_sign_msb<T: UnsignedInteger,
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_apply_univariate_lut<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    output: &mut CudaSliceMut<T>,
@@ -5670,8 +5703,9 @@ pub(crate) unsafe fn cuda_backend_apply_univariate_lut<T: UnsignedInteger, B: Nu
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_apply_many_univariate_lut<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    output: &mut CudaSliceMut<T>,
@@ -5780,8 +5814,9 @@ pub(crate) unsafe fn cuda_backend_apply_many_univariate_lut<T: UnsignedInteger,
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_apply_bivariate_lut<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    output: &mut CudaSliceMut<T>,
@@ -5905,8 +5940,9 @@ pub(crate) unsafe fn cuda_backend_apply_bivariate_lut<T: UnsignedInteger, B: Num
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_div_rem_assign<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    quotient: &mut CudaRadixCiphertext,
@@ -6107,9 +6143,9 @@ pub(crate) fn cuda_backend_get_div_rem_size_on_gpu(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
-///   be dropped until streams is synchronized.
-/// - `output_ct` must be allocated with enough blocks to store the result.
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_count_of_consecutive_bits<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    output_ct: &mut CudaRadixCiphertext,
@@ -6221,9 +6257,9 @@ pub(crate) unsafe fn cuda_backend_count_of_consecutive_bits<T: UnsignedInteger,
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
-///   be dropped until streams is synchronized.
-/// - `output_ct` must be allocated with enough blocks to store the result.
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_ilog2<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    output: &mut CudaRadixCiphertext,
@@ -6372,8 +6408,9 @@ pub(crate) unsafe fn cuda_backend_ilog2<T: UnsignedInteger, B: Numeric>(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_compute_prefix_sum_hillis_steele<
    T: UnsignedInteger,
    B: Numeric,
@@ -6489,8 +6526,9 @@ pub(crate) unsafe fn cuda_backend_compute_prefix_sum_hillis_steele<
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_unsigned_overflowing_sub_assign<
    T: UnsignedInteger,
    B: Numeric,
@@ -6644,8 +6682,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_unsigned_overflowing_sub_assign<
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_signed_abs_assign<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    ct: &mut CudaRadixCiphertext,
@@ -6729,8 +6768,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_signed_abs_assign<T: UnsignedInteger
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_is_at_least_one_comparisons_block_true<
    T: UnsignedInteger,
    B: Numeric,
@@ -6859,8 +6899,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_is_at_least_one_comparisons_block_tr
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_are_all_comparisons_block_true<
    T: UnsignedInteger,
    B: Numeric,
@@ -6987,12 +7028,11 @@ pub(crate) unsafe fn cuda_backend_unchecked_are_all_comparisons_block_true<
 }

 #[allow(clippy::too_many_arguments)]
-/// Assign negation of a vector of LWE ciphertexts representing an integer
-///
 /// # Safety
 ///
-/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
-/// required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_negate(
    streams: &CudaStreams,
    radix_lwe_out: &mut CudaRadixCiphertext,
@@ -7048,8 +7088,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_negate(

 /// # Safety
 ///
-/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
-///   be dropped until streams is synchronized
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_trim_radix_blocks_lsb(
    output: &mut CudaRadixCiphertext,
    input: &CudaRadixCiphertext,
@@ -7075,8 +7116,9 @@ pub(crate) unsafe fn cuda_backend_trim_radix_blocks_lsb(

 /// # Safety
 ///
-/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
-///   be dropped until streams is synchronized
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_extend_radix_with_trivial_zero_blocks_msb(
    output: &mut CudaRadixCiphertext,
    input: &CudaRadixCiphertext,
@@ -7104,8 +7146,9 @@ pub(crate) unsafe fn cuda_backend_extend_radix_with_trivial_zero_blocks_msb(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_noise_squashing<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    output: &mut CudaSliceMut<T>,
@@ -7215,8 +7258,9 @@ pub(crate) unsafe fn cuda_backend_noise_squashing<T: UnsignedInteger, B: Numeric
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
-///   be dropped until stream is synchronised
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 ///
 ///
 /// In this method, the input `lwe_flattened_compact_array_in` represents a flattened compact list.
@@ -7334,8 +7378,9 @@ pub(crate) unsafe fn cuda_backend_expand<T: UnsignedInteger, B: Numeric>(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_unchecked_aes_ctr_encrypt<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    output: &mut CudaRadixCiphertext,
@@ -7479,8 +7524,9 @@ pub(crate) fn cuda_backend_get_aes_ctr_encrypt_size_on_gpu(
 #[allow(clippy::too_many_arguments)]
 /// # Safety
 ///
-/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
-///   is required
+/// - The data must not be moved or dropped while being used by the CUDA kernel.
+/// - This function assumes exclusive access to the passed data; violating this may lead to
+///   undefined behavior.
 pub(crate) unsafe fn cuda_backend_aes_key_expansion<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    expanded_keys: &mut CudaRadixCiphertext,