From cf3f25efddfabbc212aa484159a0beabafd0cb00 Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Mon, 13 Oct 2025 15:26:10 +0200 Subject: [PATCH] chore(gpu): add missing syncs in linearalgebra functions and aes --- .../cuda/include/aes/aes_utilities.h | 9 +- .../cuda/src/integer/scalar_addition.cuh | 3 + .../cuda/src/linearalgebra/addition.cuh | 48 +--- .../cuda/src/linearalgebra/multiplication.cuh | 1 + .../cuda/src/linearalgebra/negation.cuh | 1 + tfhe/src/integer/gpu/mod.rs | 254 +++++++++++------- 6 files changed, 167 insertions(+), 149 deletions(-) diff --git a/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h b/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h index 2ca5e9ed8..61f1d0afc 100644 --- a/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h +++ b/backends/tfhe-cuda-backend/cuda/include/aes/aes_utilities.h @@ -79,6 +79,7 @@ template struct int_aes_lut_buffers { this->carry_lut->release(streams); delete this->carry_lut; this->carry_lut = nullptr; + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); } }; @@ -140,6 +141,7 @@ template struct int_aes_round_workspaces { allocate_gpu_memory); delete this->vec_tmp_bit_buffer; this->vec_tmp_bit_buffer = nullptr; + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); } }; @@ -206,12 +208,12 @@ template struct int_aes_counter_workspaces { delete this->vec_trivial_b_bits_buffer; this->vec_trivial_b_bits_buffer = nullptr; - free(this->h_counter_bits_buffer); if (allocate_gpu_memory) { cuda_drop_async(this->d_counter_bits_buffer, streams.stream(0), streams.gpu_index(0)); - streams.synchronize(); } + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); + free(this->h_counter_bits_buffer); } }; @@ -303,6 +305,7 @@ template struct int_aes_main_workspaces { allocate_gpu_memory); delete this->batch_processing_buffer; this->batch_processing_buffer = nullptr; + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); } }; @@ -366,6 +369,7 @@ template struct int_aes_encrypt_buffer { main_workspaces->release(streams, allocate_gpu_memory); delete main_workspaces; main_workspaces = nullptr; + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); } }; @@ -434,6 +438,7 @@ template struct int_key_expansion_buffer { this->aes_encrypt_buffer->release(streams); delete this->aes_encrypt_buffer; + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); } }; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh index 0f14d0389..dca548839 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh @@ -53,6 +53,7 @@ __host__ void host_scalar_addition_inplace( for (uint i = 0; i < num_scalars; i++) { lwe_array->degrees[i] = lwe_array->degrees[i] + h_scalar_input[i]; } + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); } template @@ -93,6 +94,7 @@ __host__ void host_add_scalar_one_inplace(CudaStreams streams, for (uint i = 0; i < lwe_array->num_radix_blocks; i++) { lwe_array->degrees[i] = lwe_array->degrees[i] + 1; } + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); } template @@ -134,5 +136,6 @@ __host__ void host_scalar_subtraction_inplace( input_lwe_ciphertext_count, lwe_dimension, delta); check_cuda_error(cudaGetLastError()); + cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0)); } #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh index 3ce90311f..2ebf3d65f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/addition.cuh @@ -2,15 +2,12 @@ #define CUDA_ADD_CUH #ifdef __CDT_PARSER__ -#undef __CUDA_RUNTIME_H__ -#include #endif #include "device.h" #include "helper_multi_gpu.h" #include "integer/integer.h" #include "integer/integer_utilities.h" -#include "linear_algebra.h" #include "utils/kernel_dimensions.cuh" #include @@ -65,6 +62,7 @@ __host__ void host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, plaintext_addition<<>>( output, lwe_input, plaintext_input, lwe_dimension, num_entries); check_cuda_error(cudaGetLastError()); + cuda_synchronize_stream(stream, gpu_index); } template @@ -86,6 +84,7 @@ __host__ void host_addition_plaintext_scalar( plaintext_addition_scalar<<>>( output, lwe_input, plaintext_input, lwe_dimension, num_entries); check_cuda_error(cudaGetLastError()); + cuda_synchronize_stream(stream, gpu_index); } template @@ -139,6 +138,7 @@ host_addition(cudaStream_t stream, uint32_t gpu_index, input_1->noise_levels[i] + input_2->noise_levels[i]; CHECK_NOISE_LEVEL(output->noise_levels[i], message_modulus, carry_modulus); } + cuda_synchronize_stream(stream, gpu_index); } template @@ -200,46 +200,6 @@ __host__ void host_add_the_same_block_to_all_blocks( } } -template -__global__ void pack_for_overflowing_ops(T *output, T const *input_1, - T const *input_2, uint32_t num_entries, - uint32_t message_modulus) { - - int tid = threadIdx.x; - int index = blockIdx.x * blockDim.x + tid; - if (index < num_entries) { - // Here we take advantage of the wrapping behaviour of uint - output[index] = input_1[index] * message_modulus + input_2[index]; - } -} - -template -__host__ void host_pack_for_overflowing_ops(cudaStream_t stream, - uint32_t gpu_index, T *output, - T const *input_1, T const *input_2, - uint32_t input_lwe_dimension, - uint32_t input_lwe_ciphertext_count, - uint32_t message_modulus) { - - cuda_set_device(gpu_index); - // lwe_size includes the presence of the body - // whereas lwe_dimension is the number of elements in the mask - int lwe_size = input_lwe_dimension + 1; - // Create a 1-dimensional grid of threads - int num_blocks = 0, num_threads = 0; - int num_entries = lwe_size; - getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); - dim3 grid(num_blocks, 1, 1); - dim3 thds(num_threads, 1, 1); - - pack_for_overflowing_ops<<>>( - &output[(input_lwe_ciphertext_count - 1) * lwe_size], - &input_1[(input_lwe_ciphertext_count - 1) * lwe_size], - &input_2[(input_lwe_ciphertext_count - 1) * lwe_size], lwe_size, - message_modulus); - check_cuda_error(cudaGetLastError()); -} - template __global__ void subtraction(T *output, T const *input_1, T const *input_2, uint32_t num_entries) { @@ -273,6 +233,7 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index, subtraction <<>>(output, input_1, input_2, num_entries); check_cuda_error(cudaGetLastError()); + cuda_synchronize_stream(stream, gpu_index); } template @@ -312,6 +273,7 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream, radix_body_subtraction_inplace<<>>( output, plaintext_input, input_lwe_dimension, num_entries); check_cuda_error(cudaGetLastError()); + cuda_synchronize_stream(stream, gpu_index); } template diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh index e0952e0c2..e0246773b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh @@ -50,6 +50,7 @@ __host__ void host_cleartext_vec_multiplication( cleartext_vec_multiplication<<>>( output, lwe_input, cleartext_input, input_lwe_dimension, num_entries); check_cuda_error(cudaGetLastError()); + cuda_synchronize_stream(stream, gpu_index); } template diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh index 1c6e4321a..47f83a347 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/negation.cuh @@ -39,6 +39,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output, negation<<>>(output, input, num_entries); check_cuda_error(cudaGetLastError()); + cuda_synchronize_stream(stream, gpu_index); } #endif // CUDA_NEGATE_H diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index d656fa18f..20dd9ea9d 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -304,8 +304,9 @@ where #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_scalar_addition_assign( streams: &CudaStreams, lwe_array: &mut CudaRadixCiphertext, @@ -356,8 +357,9 @@ pub(crate) unsafe fn cuda_backend_scalar_addition_assign( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_scalar_mul( streams: &CudaStreams, lwe_array: &mut CudaRadixCiphertext, @@ -716,8 +718,9 @@ where #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_compress< InputTorus: UnsignedInteger, OutputTorus: UnsignedInteger, @@ -869,8 +872,9 @@ pub(crate) fn cuda_backend_get_compression_size_on_gpu( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_decompress( streams: &CudaStreams, lwe_array_out: &mut CudaLweCiphertextList, @@ -950,8 +954,9 @@ pub(crate) unsafe fn cuda_backend_decompress( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. /// /// 128-bit decompression doesn't execute a PBS as the 64-bit does. /// We have a different entry point because we don't need to carry a bsk to the backend. @@ -1059,8 +1064,9 @@ pub(crate) fn cuda_backend_get_decompression_size_on_gpu( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_add_assign( streams: &CudaStreams, radix_lwe_left: &mut CudaRadixCiphertext, @@ -1133,8 +1139,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_add_assign( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_mul_assign( streams: &CudaStreams, radix_lwe_left: &mut CudaRadixCiphertext, @@ -1318,8 +1325,9 @@ pub(crate) fn cuda_backend_get_mul_size_on_gpu( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_bitop_assign( streams: &CudaStreams, radix_lwe_left: &mut CudaRadixCiphertext, @@ -1499,8 +1507,9 @@ pub(crate) fn cuda_backend_get_bitop_size_on_gpu( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_scalar_bitop_assign( streams: &CudaStreams, radix_lwe: &mut CudaRadixCiphertext, @@ -1655,8 +1664,9 @@ pub(crate) fn cuda_backend_get_scalar_bitop_size_on_gpu( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_comparison( streams: &CudaStreams, radix_lwe_out: &mut CudaRadixCiphertext, @@ -1863,8 +1873,9 @@ pub(crate) fn cuda_backend_get_comparison_size_on_gpu( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_scalar_comparison( streams: &CudaStreams, radix_lwe_out: &mut CudaRadixCiphertext, @@ -2003,8 +2014,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_scalar_comparison( streams: &CudaStreams, radix_lwe_input: &mut CudaRadixCiphertext, @@ -2141,8 +2153,9 @@ pub(crate) fn cuda_backend_get_full_propagate_assign_size_on_gpu( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_propagate_single_carry_assign( streams: &CudaStreams, radix_lwe_input: &mut CudaRadixCiphertext, @@ -2369,8 +2382,9 @@ pub(crate) fn cuda_backend_get_add_and_propagate_single_carry_assign_size_on_gpu #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_sub_and_propagate_single_carry_assign< T: UnsignedInteger, B: Numeric, @@ -2539,8 +2553,9 @@ pub(crate) unsafe fn cuda_backend_sub_and_propagate_single_carry_assign< #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_add_and_propagate_single_carry_assign< T: UnsignedInteger, B: Numeric, @@ -2700,8 +2715,9 @@ pub(crate) unsafe fn cuda_backend_add_and_propagate_single_carry_assign< #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_grouped_oprf( streams: &CudaStreams, radix_lwe_out: &mut CudaRadixCiphertext, @@ -2838,8 +2854,9 @@ pub(crate) fn cuda_backend_get_grouped_oprf_size_on_gpu( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_unsigned_scalar_div_rem< T: UnsignedInteger, B: Numeric, @@ -3031,8 +3048,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_unsigned_scalar_div_rem< #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_signed_scalar_div_rem_assign< T: UnsignedInteger, B: Numeric, @@ -3422,8 +3440,9 @@ where #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_unsigned_scalar_div_assign< T: UnsignedInteger, B: Numeric, @@ -3595,8 +3614,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_unsigned_scalar_div_assign< #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_signed_scalar_div_assign< T: UnsignedInteger, B: Numeric, @@ -3736,8 +3756,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_signed_scalar_div_assign< #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_scalar_left_shift_assign< T: UnsignedInteger, B: Numeric, @@ -3831,8 +3852,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_scalar_left_shift_assign< #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_scalar_logical_right_shift_assign< T: UnsignedInteger, B: Numeric, @@ -3926,8 +3948,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_scalar_logical_right_shift_assign< #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_scalar_arithmetic_right_shift_assign< T: UnsignedInteger, B: Numeric, @@ -4020,8 +4043,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_scalar_arithmetic_right_shift_assign #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_right_shift_assign( streams: &CudaStreams, radix_input: &mut CudaRadixCiphertext, @@ -4137,8 +4161,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_right_shift_assign( streams: &CudaStreams, radix_input: &mut CudaRadixCiphertext, @@ -4254,8 +4279,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_left_shift_assign( streams: &CudaStreams, radix_input: &mut CudaRadixCiphertext, @@ -4376,8 +4402,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_rotate_right_assign( streams: &CudaStreams, radix_input: &mut CudaRadixCiphertext, @@ -4849,8 +4876,9 @@ pub(crate) fn cuda_backend_get_rotate_left_size_on_gpu( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_cmux( streams: &CudaStreams, radix_lwe_out: &mut CudaRadixCiphertext, @@ -5088,8 +5116,9 @@ pub(crate) fn cuda_backend_get_cmux_size_on_gpu( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_scalar_rotate_left_assign< T: UnsignedInteger, B: Numeric, @@ -5185,8 +5214,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_scalar_rotate_left_assign< #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_scalar_rotate_right_assign< T: UnsignedInteger, B: Numeric, @@ -5380,8 +5410,9 @@ pub(crate) fn get_scalar_rotate_right_size_on_gpu( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_partial_sum_ciphertexts_assign< T: UnsignedInteger, B: Numeric, @@ -5489,8 +5520,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_partial_sum_ciphertexts_assign< #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_extend_radix_with_sign_msb( streams: &CudaStreams, output: &mut CudaRadixCiphertext, @@ -5564,8 +5596,9 @@ pub(crate) unsafe fn cuda_backend_extend_radix_with_sign_msb( streams: &CudaStreams, output: &mut CudaSliceMut, @@ -5670,8 +5703,9 @@ pub(crate) unsafe fn cuda_backend_apply_univariate_lut( streams: &CudaStreams, output: &mut CudaSliceMut, @@ -5780,8 +5814,9 @@ pub(crate) unsafe fn cuda_backend_apply_many_univariate_lut( streams: &CudaStreams, output: &mut CudaSliceMut, @@ -5905,8 +5940,9 @@ pub(crate) unsafe fn cuda_backend_apply_bivariate_lut( streams: &CudaStreams, quotient: &mut CudaRadixCiphertext, @@ -6107,9 +6143,9 @@ pub(crate) fn cuda_backend_get_div_rem_size_on_gpu( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not -/// be dropped until streams is synchronized. -/// - `output_ct` must be allocated with enough blocks to store the result. +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_count_of_consecutive_bits( streams: &CudaStreams, output_ct: &mut CudaRadixCiphertext, @@ -6221,9 +6257,9 @@ pub(crate) unsafe fn cuda_backend_count_of_consecutive_bits( streams: &CudaStreams, output: &mut CudaRadixCiphertext, @@ -6372,8 +6408,9 @@ pub(crate) unsafe fn cuda_backend_ilog2( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_compute_prefix_sum_hillis_steele< T: UnsignedInteger, B: Numeric, @@ -6489,8 +6526,9 @@ pub(crate) unsafe fn cuda_backend_compute_prefix_sum_hillis_steele< #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_unsigned_overflowing_sub_assign< T: UnsignedInteger, B: Numeric, @@ -6644,8 +6682,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_unsigned_overflowing_sub_assign< #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_signed_abs_assign( streams: &CudaStreams, ct: &mut CudaRadixCiphertext, @@ -6729,8 +6768,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_signed_abs_assign( streams: &CudaStreams, output: &mut CudaSliceMut, @@ -7215,8 +7258,9 @@ pub(crate) unsafe fn cuda_backend_noise_squashing( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_unchecked_aes_ctr_encrypt( streams: &CudaStreams, output: &mut CudaRadixCiphertext, @@ -7479,8 +7524,9 @@ pub(crate) fn cuda_backend_get_aes_ctr_encrypt_size_on_gpu( #[allow(clippy::too_many_arguments)] /// # Safety /// -/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization -/// is required +/// - The data must not be moved or dropped while being used by the CUDA kernel. +/// - This function assumes exclusive access to the passed data; violating this may lead to +/// undefined behavior. pub(crate) unsafe fn cuda_backend_aes_key_expansion( streams: &CudaStreams, expanded_keys: &mut CudaRadixCiphertext,