chore(gpu): add missing syncs in linearalgebra functions and aes

This commit is contained in:
Agnes Leroy
2025-10-13 15:26:10 +02:00
committed by Agnès Leroy
parent c3ed1a7558
commit cf3f25efdd
6 changed files with 167 additions and 149 deletions

View File

@@ -79,6 +79,7 @@ template <typename Torus> struct int_aes_lut_buffers {
this->carry_lut->release(streams);
delete this->carry_lut;
this->carry_lut = nullptr;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
};
@@ -140,6 +141,7 @@ template <typename Torus> struct int_aes_round_workspaces {
allocate_gpu_memory);
delete this->vec_tmp_bit_buffer;
this->vec_tmp_bit_buffer = nullptr;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
};
@@ -206,12 +208,12 @@ template <typename Torus> struct int_aes_counter_workspaces {
delete this->vec_trivial_b_bits_buffer;
this->vec_trivial_b_bits_buffer = nullptr;
free(this->h_counter_bits_buffer);
if (allocate_gpu_memory) {
cuda_drop_async(this->d_counter_bits_buffer, streams.stream(0),
streams.gpu_index(0));
streams.synchronize();
}
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(this->h_counter_bits_buffer);
}
};
@@ -303,6 +305,7 @@ template <typename Torus> struct int_aes_main_workspaces {
allocate_gpu_memory);
delete this->batch_processing_buffer;
this->batch_processing_buffer = nullptr;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
};
@@ -366,6 +369,7 @@ template <typename Torus> struct int_aes_encrypt_buffer {
main_workspaces->release(streams, allocate_gpu_memory);
delete main_workspaces;
main_workspaces = nullptr;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
};
@@ -434,6 +438,7 @@ template <typename Torus> struct int_key_expansion_buffer {
this->aes_encrypt_buffer->release(streams);
delete this->aes_encrypt_buffer;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
};

View File

@@ -53,6 +53,7 @@ __host__ void host_scalar_addition_inplace(
for (uint i = 0; i < num_scalars; i++) {
lwe_array->degrees[i] = lwe_array->degrees[i] + h_scalar_input[i];
}
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
template <typename Torus>
@@ -93,6 +94,7 @@ __host__ void host_add_scalar_one_inplace(CudaStreams streams,
for (uint i = 0; i < lwe_array->num_radix_blocks; i++) {
lwe_array->degrees[i] = lwe_array->degrees[i] + 1;
}
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
template <typename Torus>
@@ -134,5 +136,6 @@ __host__ void host_scalar_subtraction_inplace(
input_lwe_ciphertext_count,
lwe_dimension, delta);
check_cuda_error(cudaGetLastError());
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
#endif

View File

@@ -2,15 +2,12 @@
#define CUDA_ADD_CUH
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "device.h"
#include "helper_multi_gpu.h"
#include "integer/integer.h"
#include "integer/integer_utilities.h"
#include "linear_algebra.h"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>
@@ -65,6 +62,7 @@ __host__ void host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index,
plaintext_addition<T><<<grid, thds, 0, stream>>>(
output, lwe_input, plaintext_input, lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
cuda_synchronize_stream(stream, gpu_index);
}
template <typename T>
@@ -86,6 +84,7 @@ __host__ void host_addition_plaintext_scalar(
plaintext_addition_scalar<T><<<grid, thds, 0, stream>>>(
output, lwe_input, plaintext_input, lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
cuda_synchronize_stream(stream, gpu_index);
}
template <typename T>
@@ -139,6 +138,7 @@ host_addition(cudaStream_t stream, uint32_t gpu_index,
input_1->noise_levels[i] + input_2->noise_levels[i];
CHECK_NOISE_LEVEL(output->noise_levels[i], message_modulus, carry_modulus);
}
cuda_synchronize_stream(stream, gpu_index);
}
template <typename T>
@@ -200,46 +200,6 @@ __host__ void host_add_the_same_block_to_all_blocks(
}
}
template <typename T>
__global__ void pack_for_overflowing_ops(T *output, T const *input_1,
T const *input_2, uint32_t num_entries,
uint32_t message_modulus) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
if (index < num_entries) {
// Here we take advantage of the wrapping behaviour of uint
output[index] = input_1[index] * message_modulus + input_2[index];
}
}
template <typename T>
__host__ void host_pack_for_overflowing_ops(cudaStream_t stream,
uint32_t gpu_index, T *output,
T const *input_1, T const *input_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus) {
cuda_set_device(gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
pack_for_overflowing_ops<T><<<grid, thds, 0, stream>>>(
&output[(input_lwe_ciphertext_count - 1) * lwe_size],
&input_1[(input_lwe_ciphertext_count - 1) * lwe_size],
&input_2[(input_lwe_ciphertext_count - 1) * lwe_size], lwe_size,
message_modulus);
check_cuda_error(cudaGetLastError());
}
template <typename T>
__global__ void subtraction(T *output, T const *input_1, T const *input_2,
uint32_t num_entries) {
@@ -273,6 +233,7 @@ __host__ void host_subtraction(cudaStream_t stream, uint32_t gpu_index,
subtraction<T>
<<<grid, thds, 0, stream>>>(output, input_1, input_2, num_entries);
check_cuda_error(cudaGetLastError());
cuda_synchronize_stream(stream, gpu_index);
}
template <typename T>
@@ -312,6 +273,7 @@ __host__ void host_subtraction_plaintext(cudaStream_t stream,
radix_body_subtraction_inplace<T><<<grid, thds, 0, stream>>>(
output, plaintext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
cuda_synchronize_stream(stream, gpu_index);
}
template <typename T>

View File

@@ -50,6 +50,7 @@ __host__ void host_cleartext_vec_multiplication(
cleartext_vec_multiplication<T><<<grid, thds, 0, stream>>>(
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
cuda_synchronize_stream(stream, gpu_index);
}
template <typename T>

View File

@@ -39,6 +39,7 @@ __host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
negation<T><<<grid, thds, 0, stream>>>(output, input, num_entries);
check_cuda_error(cudaGetLastError());
cuda_synchronize_stream(stream, gpu_index);
}
#endif // CUDA_NEGATE_H

View File

@@ -304,8 +304,9 @@ where
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_scalar_addition_assign<T: UnsignedInteger>(
streams: &CudaStreams,
lwe_array: &mut CudaRadixCiphertext,
@@ -356,8 +357,9 @@ pub(crate) unsafe fn cuda_backend_scalar_addition_assign<T: UnsignedInteger>(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_scalar_mul<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
lwe_array: &mut CudaRadixCiphertext,
@@ -716,8 +718,9 @@ where
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_compress<
InputTorus: UnsignedInteger,
OutputTorus: UnsignedInteger,
@@ -869,8 +872,9 @@ pub(crate) fn cuda_backend_get_compression_size_on_gpu(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_decompress<B: Numeric>(
streams: &CudaStreams,
lwe_array_out: &mut CudaLweCiphertextList<u64>,
@@ -950,8 +954,9 @@ pub(crate) unsafe fn cuda_backend_decompress<B: Numeric>(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
///
/// 128-bit decompression doesn't execute a PBS as the 64-bit does.
/// We have a different entry point because we don't need to carry a bsk to the backend.
@@ -1059,8 +1064,9 @@ pub(crate) fn cuda_backend_get_decompression_size_on_gpu(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_add_assign(
streams: &CudaStreams,
radix_lwe_left: &mut CudaRadixCiphertext,
@@ -1133,8 +1139,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_add_assign(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_mul_assign<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
radix_lwe_left: &mut CudaRadixCiphertext,
@@ -1318,8 +1325,9 @@ pub(crate) fn cuda_backend_get_mul_size_on_gpu(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_bitop_assign<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
radix_lwe_left: &mut CudaRadixCiphertext,
@@ -1499,8 +1507,9 @@ pub(crate) fn cuda_backend_get_bitop_size_on_gpu(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_scalar_bitop_assign<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
radix_lwe: &mut CudaRadixCiphertext,
@@ -1655,8 +1664,9 @@ pub(crate) fn cuda_backend_get_scalar_bitop_size_on_gpu(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_comparison<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
radix_lwe_out: &mut CudaRadixCiphertext,
@@ -1863,8 +1873,9 @@ pub(crate) fn cuda_backend_get_comparison_size_on_gpu(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_scalar_comparison<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
radix_lwe_out: &mut CudaRadixCiphertext,
@@ -2003,8 +2014,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_scalar_comparison<T: UnsignedInteger
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_full_propagate_assign<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
radix_lwe_input: &mut CudaRadixCiphertext,
@@ -2141,8 +2153,9 @@ pub(crate) fn cuda_backend_get_full_propagate_assign_size_on_gpu(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_propagate_single_carry_assign<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
radix_lwe_input: &mut CudaRadixCiphertext,
@@ -2369,8 +2382,9 @@ pub(crate) fn cuda_backend_get_add_and_propagate_single_carry_assign_size_on_gpu
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_sub_and_propagate_single_carry_assign<
T: UnsignedInteger,
B: Numeric,
@@ -2539,8 +2553,9 @@ pub(crate) unsafe fn cuda_backend_sub_and_propagate_single_carry_assign<
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_add_and_propagate_single_carry_assign<
T: UnsignedInteger,
B: Numeric,
@@ -2700,8 +2715,9 @@ pub(crate) unsafe fn cuda_backend_add_and_propagate_single_carry_assign<
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_grouped_oprf<B: Numeric>(
streams: &CudaStreams,
radix_lwe_out: &mut CudaRadixCiphertext,
@@ -2838,8 +2854,9 @@ pub(crate) fn cuda_backend_get_grouped_oprf_size_on_gpu(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_unsigned_scalar_div_rem<
T: UnsignedInteger,
B: Numeric,
@@ -3031,8 +3048,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_unsigned_scalar_div_rem<
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_signed_scalar_div_rem_assign<
T: UnsignedInteger,
B: Numeric,
@@ -3422,8 +3440,9 @@ where
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_unsigned_scalar_div_assign<
T: UnsignedInteger,
B: Numeric,
@@ -3595,8 +3614,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_unsigned_scalar_div_assign<
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_signed_scalar_div_assign<
T: UnsignedInteger,
B: Numeric,
@@ -3736,8 +3756,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_signed_scalar_div_assign<
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_scalar_left_shift_assign<
T: UnsignedInteger,
B: Numeric,
@@ -3831,8 +3852,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_scalar_left_shift_assign<
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_scalar_logical_right_shift_assign<
T: UnsignedInteger,
B: Numeric,
@@ -3926,8 +3948,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_scalar_logical_right_shift_assign<
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_scalar_arithmetic_right_shift_assign<
T: UnsignedInteger,
B: Numeric,
@@ -4020,8 +4043,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_scalar_arithmetic_right_shift_assign
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_right_shift_assign<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
radix_input: &mut CudaRadixCiphertext,
@@ -4137,8 +4161,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_right_shift_assign<T: UnsignedIntege
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_left_shift_assign<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
radix_input: &mut CudaRadixCiphertext,
@@ -4254,8 +4279,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_left_shift_assign<T: UnsignedInteger
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_rotate_right_assign<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
radix_input: &mut CudaRadixCiphertext,
@@ -4376,8 +4402,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_rotate_right_assign<T: UnsignedInteg
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_rotate_left_assign<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
radix_input: &mut CudaRadixCiphertext,
@@ -4849,8 +4876,9 @@ pub(crate) fn cuda_backend_get_rotate_left_size_on_gpu(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_cmux<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
radix_lwe_out: &mut CudaRadixCiphertext,
@@ -5088,8 +5116,9 @@ pub(crate) fn cuda_backend_get_cmux_size_on_gpu(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_scalar_rotate_left_assign<
T: UnsignedInteger,
B: Numeric,
@@ -5185,8 +5214,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_scalar_rotate_left_assign<
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_scalar_rotate_right_assign<
T: UnsignedInteger,
B: Numeric,
@@ -5380,8 +5410,9 @@ pub(crate) fn get_scalar_rotate_right_size_on_gpu(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_partial_sum_ciphertexts_assign<
T: UnsignedInteger,
B: Numeric,
@@ -5489,8 +5520,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_partial_sum_ciphertexts_assign<
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_extend_radix_with_sign_msb<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
output: &mut CudaRadixCiphertext,
@@ -5564,8 +5596,9 @@ pub(crate) unsafe fn cuda_backend_extend_radix_with_sign_msb<T: UnsignedInteger,
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_apply_univariate_lut<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
output: &mut CudaSliceMut<T>,
@@ -5670,8 +5703,9 @@ pub(crate) unsafe fn cuda_backend_apply_univariate_lut<T: UnsignedInteger, B: Nu
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_apply_many_univariate_lut<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
output: &mut CudaSliceMut<T>,
@@ -5780,8 +5814,9 @@ pub(crate) unsafe fn cuda_backend_apply_many_univariate_lut<T: UnsignedInteger,
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_apply_bivariate_lut<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
output: &mut CudaSliceMut<T>,
@@ -5905,8 +5940,9 @@ pub(crate) unsafe fn cuda_backend_apply_bivariate_lut<T: UnsignedInteger, B: Num
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_div_rem_assign<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
quotient: &mut CudaRadixCiphertext,
@@ -6107,9 +6143,9 @@ pub(crate) fn cuda_backend_get_div_rem_size_on_gpu(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until streams is synchronized.
/// - `output_ct` must be allocated with enough blocks to store the result.
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_count_of_consecutive_bits<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
output_ct: &mut CudaRadixCiphertext,
@@ -6221,9 +6257,9 @@ pub(crate) unsafe fn cuda_backend_count_of_consecutive_bits<T: UnsignedInteger,
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until streams is synchronized.
/// - `output_ct` must be allocated with enough blocks to store the result.
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_ilog2<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
output: &mut CudaRadixCiphertext,
@@ -6372,8 +6408,9 @@ pub(crate) unsafe fn cuda_backend_ilog2<T: UnsignedInteger, B: Numeric>(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_compute_prefix_sum_hillis_steele<
T: UnsignedInteger,
B: Numeric,
@@ -6489,8 +6526,9 @@ pub(crate) unsafe fn cuda_backend_compute_prefix_sum_hillis_steele<
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_unsigned_overflowing_sub_assign<
T: UnsignedInteger,
B: Numeric,
@@ -6644,8 +6682,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_unsigned_overflowing_sub_assign<
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_signed_abs_assign<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
ct: &mut CudaRadixCiphertext,
@@ -6729,8 +6768,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_signed_abs_assign<T: UnsignedInteger
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_is_at_least_one_comparisons_block_true<
T: UnsignedInteger,
B: Numeric,
@@ -6859,8 +6899,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_is_at_least_one_comparisons_block_tr
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_are_all_comparisons_block_true<
T: UnsignedInteger,
B: Numeric,
@@ -6987,12 +7028,11 @@ pub(crate) unsafe fn cuda_backend_unchecked_are_all_comparisons_block_true<
}
#[allow(clippy::too_many_arguments)]
/// Assign negation of a vector of LWE ciphertexts representing an integer
///
/// # Safety
///
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
/// required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_negate(
streams: &CudaStreams,
radix_lwe_out: &mut CudaRadixCiphertext,
@@ -7048,8 +7088,9 @@ pub(crate) unsafe fn cuda_backend_unchecked_negate(
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until streams is synchronized
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_trim_radix_blocks_lsb(
output: &mut CudaRadixCiphertext,
input: &CudaRadixCiphertext,
@@ -7075,8 +7116,9 @@ pub(crate) unsafe fn cuda_backend_trim_radix_blocks_lsb(
/// # Safety
///
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until streams is synchronized
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_extend_radix_with_trivial_zero_blocks_msb(
output: &mut CudaRadixCiphertext,
input: &CudaRadixCiphertext,
@@ -7104,8 +7146,9 @@ pub(crate) unsafe fn cuda_backend_extend_radix_with_trivial_zero_blocks_msb(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_noise_squashing<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
output: &mut CudaSliceMut<T>,
@@ -7215,8 +7258,9 @@ pub(crate) unsafe fn cuda_backend_noise_squashing<T: UnsignedInteger, B: Numeric
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until stream is synchronised
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
///
///
/// In this method, the input `lwe_flattened_compact_array_in` represents a flattened compact list.
@@ -7334,8 +7378,9 @@ pub(crate) unsafe fn cuda_backend_expand<T: UnsignedInteger, B: Numeric>(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_unchecked_aes_ctr_encrypt<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
output: &mut CudaRadixCiphertext,
@@ -7479,8 +7524,9 @@ pub(crate) fn cuda_backend_get_aes_ctr_encrypt_size_on_gpu(
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
/// - The data must not be moved or dropped while being used by the CUDA kernel.
/// - This function assumes exclusive access to the passed data; violating this may lead to
/// undefined behavior.
pub(crate) unsafe fn cuda_backend_aes_key_expansion<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
expanded_keys: &mut CudaRadixCiphertext,