feat(gpu): aes 128

This commit is contained in:
Enzo Di Maria
2025-08-26 16:07:54 +02:00
committed by enzodimaria
parent 0604d237eb
commit f0f3dd76eb
17 changed files with 3192 additions and 0 deletions

View File

@@ -7403,3 +7403,289 @@ pub unsafe fn expand_async<T: UnsignedInteger, B: Numeric>(
);
cleanup_expand_without_verification_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr));
}
#[allow(clippy::too_many_arguments)]
/// Runs the CUDA AES-CTR encryption entry point and stores the result in `output`.
///
/// For every `i` in `0..num_aes_inputs`, the clear counter `start_counter + i` (wrapping
/// modulo 2^128) is decomposed on the host into 128 bits, least-significant bit first, and the
/// resulting buffer is handed to the backend together with the encrypted `iv` and `round_keys`.
/// Scratch memory is created before the backend call and released right after it; the
/// degree / noise-level metadata of `output` is then refreshed from the FFI descriptor.
///
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
///   is required
pub unsafe fn unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async<
    T: UnsignedInteger,
    B: Numeric,
>(
    streams: &CudaStreams,
    output: &mut CudaRadixCiphertext,
    iv: &CudaRadixCiphertext,
    round_keys: &CudaRadixCiphertext,
    start_counter: u128,
    num_aes_inputs: u32,
    sbox_parallelism: u32,
    bootstrapping_key: &CudaVec<B>,
    keyswitch_key: &CudaVec<T>,
    message_modulus: MessageModulus,
    carry_modulus: CarryModulus,
    glwe_dimension: GlweDimension,
    polynomial_size: PolynomialSize,
    lwe_dimension: LweDimension,
    ks_level: DecompositionLevelCount,
    ks_base_log: DecompositionBaseLog,
    pbs_level: DecompositionLevelCount,
    pbs_base_log: DecompositionBaseLog,
    grouping_factor: LweBskGroupingFactor,
    pbs_type: PBSType,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
    // Build the FFI descriptors; the degree / noise vectors back the descriptors and must stay
    // alive for as long as the descriptors are used.
    let mut output_degrees = output.info.blocks.iter().map(|b| b.degree.0).collect();
    let mut output_noise_levels = output.info.blocks.iter().map(|b| b.noise_level.0).collect();
    let mut cuda_ffi_output =
        prepare_cuda_radix_ffi(output, &mut output_degrees, &mut output_noise_levels);

    let mut iv_degrees = iv.info.blocks.iter().map(|b| b.degree.0).collect();
    let mut iv_noise_levels = iv.info.blocks.iter().map(|b| b.noise_level.0).collect();
    let cuda_ffi_iv = prepare_cuda_radix_ffi(iv, &mut iv_degrees, &mut iv_noise_levels);

    let mut round_keys_degrees = round_keys.info.blocks.iter().map(|b| b.degree.0).collect();
    let mut round_keys_noise_levels = round_keys
        .info
        .blocks
        .iter()
        .map(|b| b.noise_level.0)
        .collect();
    let cuda_ffi_round_keys = prepare_cuda_radix_ffi(
        round_keys,
        &mut round_keys_degrees,
        &mut round_keys_noise_levels,
    );

    let noise_reduction_type = resolve_noise_reduction_type(ms_noise_reduction_configuration);

    // Clear counter bits, 128 per AES input, least-significant bit first.
    // NOTE(review): this buffer is dropped when the function returns; this assumes the backend
    // is done reading the host pointer by then - confirm against the CUDA backend.
    let counter_bits_le: Vec<u64> = (0..num_aes_inputs)
        .flat_map(|i| {
            // CTR counters are 128-bit values that wrap around: use wrapping arithmetic so a
            // `start_counter` close to `u128::MAX` does not panic in overflow-checked builds.
            let current_counter = start_counter.wrapping_add(u128::from(i));
            (0..128).map(move |bit_index| ((current_counter >> bit_index) & 1) as u64)
        })
        .collect();

    let mut mem_ptr: *mut i8 = std::ptr::null_mut();
    scratch_cuda_integer_aes_encrypt_64(
        streams.ffi(),
        std::ptr::addr_of_mut!(mem_ptr),
        glwe_dimension.0 as u32,
        polynomial_size.0 as u32,
        lwe_dimension.0 as u32,
        ks_level.0 as u32,
        ks_base_log.0 as u32,
        pbs_level.0 as u32,
        pbs_base_log.0 as u32,
        grouping_factor.0 as u32,
        message_modulus.0 as u32,
        carry_modulus.0 as u32,
        pbs_type as u32,
        true,
        noise_reduction_type as u32,
        num_aes_inputs,
        sbox_parallelism,
    );
    cuda_integer_aes_ctr_encrypt_64(
        streams.ffi(),
        &raw mut cuda_ffi_output,
        &raw const cuda_ffi_iv,
        &raw const cuda_ffi_round_keys,
        counter_bits_le.as_ptr(),
        num_aes_inputs,
        mem_ptr,
        bootstrapping_key.ptr.as_ptr(),
        keyswitch_key.ptr.as_ptr(),
    );
    cleanup_cuda_integer_aes_encrypt_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr));

    // Propagate the degree / noise information written by the backend back to `output`.
    update_noise_degree(output, &cuda_ffi_output);
}
#[allow(clippy::too_many_arguments)]
/// Returns the value reported by the AES-CTR scratch entry point for the given parameters.
///
/// The scratch function is invoked with its boolean flag set to `false` and the scratch handle
/// is cleaned up immediately afterwards; only the reported size is kept.
///
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
///   is required
pub unsafe fn get_aes_ctr_encrypt_integer_radix_size_on_gpu(
    streams: &CudaStreams,
    num_aes_inputs: u32,
    sbox_parallelism: u32,
    message_modulus: MessageModulus,
    carry_modulus: CarryModulus,
    glwe_dimension: GlweDimension,
    polynomial_size: PolynomialSize,
    lwe_dimension: LweDimension,
    ks_level: DecompositionLevelCount,
    ks_base_log: DecompositionBaseLog,
    pbs_level: DecompositionLevelCount,
    pbs_base_log: DecompositionBaseLog,
    grouping_factor: LweBskGroupingFactor,
    pbs_type: PBSType,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) -> u64 {
    let ms_reduction = resolve_noise_reduction_type(ms_noise_reduction_configuration);
    let mut scratch: *mut i8 = std::ptr::null_mut();
    let scratch_slot = std::ptr::addr_of_mut!(scratch);

    // SAFETY: the caller upholds the contract documented in the `# Safety` section above.
    unsafe {
        let reported_size = scratch_cuda_integer_aes_encrypt_64(
            streams.ffi(),
            scratch_slot,
            glwe_dimension.0 as u32,
            polynomial_size.0 as u32,
            lwe_dimension.0 as u32,
            ks_level.0 as u32,
            ks_base_log.0 as u32,
            pbs_level.0 as u32,
            pbs_base_log.0 as u32,
            grouping_factor.0 as u32,
            message_modulus.0 as u32,
            carry_modulus.0 as u32,
            pbs_type as u32,
            false,
            ms_reduction as u32,
            num_aes_inputs,
            sbox_parallelism,
        );
        cleanup_cuda_integer_aes_encrypt_64(streams.ffi(), scratch_slot);
        reported_size
    }
}
#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
/// is required
pub unsafe fn unchecked_key_expansion_integer_radix_kb_assign_async<
T: UnsignedInteger,
B: Numeric,
>(
streams: &CudaStreams,
expanded_keys: &mut CudaRadixCiphertext,
key: &CudaRadixCiphertext,
bootstrapping_key: &CudaVec<B>,
keyswitch_key: &CudaVec<T>,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
glwe_dimension: GlweDimension,
polynomial_size: PolynomialSize,
lwe_dimension: LweDimension,
ks_level: DecompositionLevelCount,
ks_base_log: DecompositionBaseLog,
pbs_level: DecompositionLevelCount,
pbs_base_log: DecompositionBaseLog,
grouping_factor: LweBskGroupingFactor,
pbs_type: PBSType,
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) {
let mut expanded_keys_degrees = expanded_keys
.info
.blocks
.iter()
.map(|b| b.degree.0)
.collect();
let mut expanded_keys_noise_levels = expanded_keys
.info
.blocks
.iter()
.map(|b| b.noise_level.0)
.collect();
let mut cuda_ffi_expanded_keys = prepare_cuda_radix_ffi(
expanded_keys,
&mut expanded_keys_degrees,
&mut expanded_keys_noise_levels,
);
let mut key_degrees = key.info.blocks.iter().map(|b| b.degree.0).collect();
let mut key_noise_levels = key.info.blocks.iter().map(|b| b.noise_level.0).collect();
let cuda_ffi_key = prepare_cuda_radix_ffi(key, &mut key_degrees, &mut key_noise_levels);
let noise_reduction_type = resolve_ms_noise_reduction_config(ms_noise_reduction_configuration);
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
scratch_cuda_integer_key_expansion_64(
streams.ffi(),
std::ptr::addr_of_mut!(mem_ptr),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
lwe_dimension.0 as u32,
ks_level.0 as u32,
ks_base_log.0 as u32,
pbs_level.0 as u32,
pbs_base_log.0 as u32,
grouping_factor.0 as u32,
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
true,
noise_reduction_type as u32,
);
cuda_integer_key_expansion_64(
streams.ffi(),
&raw mut cuda_ffi_expanded_keys,
&raw const cuda_ffi_key,
mem_ptr,
bootstrapping_key.ptr.as_ptr(),
keyswitch_key.ptr.as_ptr(),
);
cleanup_cuda_integer_key_expansion_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr));
update_noise_degree(expanded_keys, &cuda_ffi_expanded_keys);
}
#[allow(clippy::too_many_arguments)]
/// Returns the value reported by the key-expansion scratch entry point for the given parameters.
///
/// This is a pure size query: the scratch function is called with its allocation flag set to
/// `false` (as the AES size query does), so asking for the size does not itself reserve the
/// GPU memory it is trying to measure. The scratch handle is cleaned up before returning.
///
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
///   is required
pub unsafe fn get_key_expansion_integer_radix_size_on_gpu(
    streams: &CudaStreams,
    message_modulus: MessageModulus,
    carry_modulus: CarryModulus,
    glwe_dimension: GlweDimension,
    polynomial_size: PolynomialSize,
    lwe_dimension: LweDimension,
    ks_level: DecompositionLevelCount,
    ks_base_log: DecompositionBaseLog,
    pbs_level: DecompositionLevelCount,
    pbs_base_log: DecompositionBaseLog,
    grouping_factor: LweBskGroupingFactor,
    pbs_type: PBSType,
    ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
) -> u64 {
    let noise_reduction_type = resolve_noise_reduction_type(ms_noise_reduction_configuration);
    let mut mem_ptr: *mut i8 = std::ptr::null_mut();
    let size = unsafe {
        scratch_cuda_integer_key_expansion_64(
            streams.ffi(),
            std::ptr::addr_of_mut!(mem_ptr),
            glwe_dimension.0 as u32,
            polynomial_size.0 as u32,
            lwe_dimension.0 as u32,
            ks_level.0 as u32,
            ks_base_log.0 as u32,
            pbs_level.0 as u32,
            pbs_base_log.0 as u32,
            grouping_factor.0 as u32,
            message_modulus.0 as u32,
            carry_modulus.0 as u32,
            pbs_type as u32,
            // Size query only: do not allocate the scratch buffers.
            false,
            noise_reduction_type as u32,
        )
    };
    unsafe {
        cleanup_cuda_integer_key_expansion_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr))
    };
    size
}

View File

@@ -0,0 +1,465 @@
use crate::core_crypto::gpu::{
check_valid_cuda_malloc, check_valid_cuda_malloc_assert_oom, CudaStreams,
};
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext};
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::{
get_aes_ctr_encrypt_integer_radix_size_on_gpu, get_key_expansion_integer_radix_size_on_gpu,
unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async,
unchecked_key_expansion_integer_radix_kb_assign_async, PBSType,
};
use crate::integer::{RadixCiphertext, RadixClientKey};
use crate::shortint::Ciphertext;
const NUM_BITS: usize = 128;
impl RadixClientKey {
/// Encrypts a 128-bit value (for instance an AES key or IV) bit by bit.
///
/// The value is walked from its most-significant bit down to its least-significant bit;
/// each bit is encrypted on its own with `self.encrypt`, and the blocks of every resulting
/// ciphertext are concatenated in that order:
///
/// ```text
/// data (u128)  ->  [b127, b126, ..., b1, b0]
///              ->  [Ct(b127), Ct(b126), ..., Ct(b1), Ct(b0)]
///              ->  RadixCiphertext built from those blocks
/// ```
///
/// NOTE(review): the AES tests build the client key with a single block per value, which
/// yields exactly 128 blocks here - confirm this is the intended configuration.
pub fn encrypt_u128_for_aes_ctr(&self, data: u128) -> RadixCiphertext {
    let bit_blocks: Vec<Ciphertext> = (0..NUM_BITS)
        .rev()
        .flat_map(|shift| {
            let bit = ((data >> shift) & 1) as u64;
            self.encrypt(bit).blocks
        })
        .collect();
    RadixCiphertext::from(bit_blocks)
}
/// Decrypts `num_aes_inputs` consecutive groups of 128 bit-ciphertexts into `u128` values.
///
/// Group `i` covers blocks `i * 128 .. (i + 1) * 128` of `encrypted_result`. Inside a group
/// the blocks are decrypted one at a time and folded most-significant bit first:
///
/// ```text
/// [Ct(b127), ..., Ct(b0), Ct(b'127), ..., Ct(b'0), ...]
///     -> bits b127 ... b0          -> (...((b127 << 1) | b126) << 1 | ...) | b0
///     -> bits b'127 ... b'0        -> ...
/// ```
///
/// Panics if `encrypted_result` holds fewer than `num_aes_inputs * 128` blocks.
pub fn decrypt_u128_from_aes_ctr(
    &self,
    encrypted_result: &RadixCiphertext,
    num_aes_inputs: usize,
) -> Vec<u128> {
    (0..num_aes_inputs)
        .map(|input_index| {
            let first_block = input_index * NUM_BITS;
            (first_block..first_block + NUM_BITS).fold(0u128, |assembled, block_index| {
                // Wrap the single block so the regular radix decryption can be reused.
                let single_bit_ct =
                    RadixCiphertext::from(vec![encrypted_result.blocks[block_index].clone()]);
                let bit: u128 = self.decrypt(&single_bit_ct);
                (assembled << 1) | bit
            })
        })
        .collect()
}
}
impl CudaServerKey {
/// Expands `key`, then encrypts `num_aes_inputs` counter blocks, picking the S-box
/// parallelism automatically.
///
/// The S-box has 16 bytes to process and can handle them 16, 8, 4, 2 or 1 at a time; more
/// parallelism costs more GPU memory. The candidates are tried from the largest to the
/// smallest and the first one whose reported memory requirement passes
/// `check_valid_cuda_malloc` on the first GPU of `streams` is used. The streams are
/// synchronized before returning.
///
/// Panics if the key expansion does not pass the memory check, or if no parallelism level
/// does.
pub fn aes_ctr(
    &self,
    key: &CudaUnsignedRadixCiphertext,
    iv: &CudaUnsignedRadixCiphertext,
    start_counter: u128,
    num_aes_inputs: usize,
    streams: &CudaStreams,
) -> CudaUnsignedRadixCiphertext {
    let gpu_index = streams.gpu_indexes[0];
    check_valid_cuda_malloc_assert_oom(self.get_key_expansion_size_on_gpu(streams), gpu_index);

    for parallelism in [16usize, 8, 4, 2, 1] {
        // `num_aes_inputs` is the number of 128-bit ciphertexts that AES will produce.
        let required = self.get_aes_encrypt_size_on_gpu(num_aes_inputs, parallelism, streams);
        if !check_valid_cuda_malloc(required, gpu_index) {
            continue;
        }

        // SAFETY: the streams are synchronized below, before the result is handed out.
        let encrypted = unsafe {
            let round_keys = self.key_expansion_async(key, streams);
            self.aes_encrypt_async(
                iv,
                &round_keys,
                start_counter,
                num_aes_inputs,
                parallelism,
                streams,
            )
        };
        streams.synchronize();
        return encrypted;
    }

    panic!("Failed to allocate GPU memory for AES, even with the lowest parallelism setting.");
}
/// Expands `key`, then encrypts `num_aes_inputs` counter blocks using the caller-chosen
/// `sbox_parallelism`.
///
/// Panics if `sbox_parallelism` is not one of 1, 2, 4, 8 or 16, or if either the key
/// expansion or the AES encryption fails the memory check on the first GPU of `streams`.
/// The streams are synchronized before returning.
pub fn aes_ctr_with_fixed_parallelism(
    &self,
    key: &CudaUnsignedRadixCiphertext,
    iv: &CudaUnsignedRadixCiphertext,
    start_counter: u128,
    num_aes_inputs: usize,
    sbox_parallelism: usize,
    streams: &CudaStreams,
) -> CudaUnsignedRadixCiphertext {
    assert!(
        matches!(sbox_parallelism, 1 | 2 | 4 | 8 | 16),
        "Invalid S-Box parallelism: must be one of [1, 2, 4, 8, 16], got {sbox_parallelism}"
    );

    let gpu_index = streams.gpu_indexes[0];
    check_valid_cuda_malloc_assert_oom(self.get_key_expansion_size_on_gpu(streams), gpu_index);
    check_valid_cuda_malloc_assert_oom(
        self.get_aes_encrypt_size_on_gpu(num_aes_inputs, sbox_parallelism, streams),
        gpu_index,
    );

    // SAFETY: the streams are synchronized below, before the result is handed out.
    let encrypted = unsafe {
        let round_keys = self.key_expansion_async(key, streams);
        self.aes_encrypt_async(
            iv,
            &round_keys,
            start_counter,
            num_aes_inputs,
            sbox_parallelism,
            streams,
        )
    };
    streams.synchronize();
    encrypted
}
/// Encrypts `num_aes_inputs` counter blocks with already expanded `round_keys`.
///
/// A trivial zero ciphertext of `num_aes_inputs * 128` blocks is created for the result,
/// the block counts of `iv` (128), `round_keys` (11 * 128) and of the result are asserted,
/// and the work is forwarded to the backend wrapper matching the bootstrapping key kind.
///
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as
///   synchronization is required
pub unsafe fn aes_encrypt_async(
    &self,
    iv: &CudaUnsignedRadixCiphertext,
    round_keys: &CudaUnsignedRadixCiphertext,
    start_counter: u128,
    num_aes_inputs: usize,
    sbox_parallelism: usize,
    streams: &CudaStreams,
) -> CudaUnsignedRadixCiphertext {
    let mut result: CudaUnsignedRadixCiphertext =
        self.create_trivial_zero_radix(num_aes_inputs * 128, streams);
    let num_round_key_blocks = 11 * NUM_BITS;

    let iv_block_count = iv.as_ref().d_blocks.lwe_ciphertext_count().0;
    assert_eq!(
        iv_block_count, NUM_BITS,
        "AES IV must contain {NUM_BITS} encrypted bits, but contains {}",
        iv_block_count
    );

    let round_key_block_count = round_keys.as_ref().d_blocks.lwe_ciphertext_count().0;
    assert_eq!(
        round_key_block_count, num_round_key_blocks,
        "AES round_keys must contain {num_round_key_blocks} encrypted bits, but contains {}",
        round_key_block_count
    );

    let result_block_count = result.as_ref().d_blocks.lwe_ciphertext_count().0;
    assert_eq!(
        result_block_count,
        num_aes_inputs * 128,
        "AES result must contain {} encrypted bits for {num_aes_inputs} blocks, but contains {}",
        num_aes_inputs * 128,
        result_block_count
    );

    // Values shared by both bootstrapping-key flavours.
    let ksk = &self.key_switching_key;
    let ks_level = ksk.decomposition_level_count();
    let ks_base_log = ksk.decomposition_base_log();
    let input_count = num_aes_inputs as u32;
    let parallelism = sbox_parallelism as u32;

    match &self.bootstrapping_key {
        CudaBootstrappingKey::Classic(classic) => {
            unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async(
                streams,
                result.as_mut(),
                iv.as_ref(),
                round_keys.as_ref(),
                start_counter,
                input_count,
                parallelism,
                &classic.d_vec,
                &ksk.d_vec,
                self.message_modulus,
                self.carry_modulus,
                classic.glwe_dimension,
                classic.polynomial_size,
                classic.input_lwe_dimension,
                ks_level,
                ks_base_log,
                classic.decomp_level_count,
                classic.decomp_base_log,
                LweBskGroupingFactor(0),
                PBSType::Classical,
                classic.ms_noise_reduction_configuration.as_ref(),
            );
        }
        CudaBootstrappingKey::MultiBit(multibit) => {
            unchecked_aes_ctr_encrypt_integer_radix_kb_assign_async(
                streams,
                result.as_mut(),
                iv.as_ref(),
                round_keys.as_ref(),
                start_counter,
                input_count,
                parallelism,
                &multibit.d_vec,
                &ksk.d_vec,
                self.message_modulus,
                self.carry_modulus,
                multibit.glwe_dimension,
                multibit.polynomial_size,
                multibit.input_lwe_dimension,
                ks_level,
                ks_base_log,
                multibit.decomp_level_count,
                multibit.decomp_base_log,
                multibit.grouping_factor,
                PBSType::MultiBit,
                None,
            );
        }
    }

    result
}
/// Synchronous variant of `get_aes_encrypt_size_on_gpu_async`: queries the size reported for
/// an AES encryption with the given input count and S-box parallelism, then synchronizes
/// `streams` before returning it.
fn get_aes_encrypt_size_on_gpu(
    &self,
    num_aes_inputs: usize,
    sbox_parallelism: usize,
    streams: &CudaStreams,
) -> u64 {
    // SAFETY: the streams are synchronized right after the query.
    let reported_size = unsafe {
        self.get_aes_encrypt_size_on_gpu_async(num_aes_inputs, sbox_parallelism, streams)
    };
    streams.synchronize();
    reported_size
}
/// Forwards the AES size query to the backend wrapper, filling in the cryptographic
/// parameters from this server key and its bootstrapping key flavour.
///
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as
///   synchronization is required
unsafe fn get_aes_encrypt_size_on_gpu_async(
    &self,
    num_aes_inputs: usize,
    sbox_parallelism: usize,
    streams: &CudaStreams,
) -> u64 {
    // Values shared by both bootstrapping-key flavours.
    let ks_level = self.key_switching_key.decomposition_level_count();
    let ks_base_log = self.key_switching_key.decomposition_base_log();
    let input_count = num_aes_inputs as u32;
    let parallelism = sbox_parallelism as u32;

    match &self.bootstrapping_key {
        CudaBootstrappingKey::Classic(classic) => {
            get_aes_ctr_encrypt_integer_radix_size_on_gpu(
                streams,
                input_count,
                parallelism,
                self.message_modulus,
                self.carry_modulus,
                classic.glwe_dimension,
                classic.polynomial_size,
                classic.input_lwe_dimension,
                ks_level,
                ks_base_log,
                classic.decomp_level_count,
                classic.decomp_base_log,
                LweBskGroupingFactor(0),
                PBSType::Classical,
                classic.ms_noise_reduction_configuration.as_ref(),
            )
        }
        CudaBootstrappingKey::MultiBit(multibit) => {
            get_aes_ctr_encrypt_integer_radix_size_on_gpu(
                streams,
                input_count,
                parallelism,
                self.message_modulus,
                self.carry_modulus,
                multibit.glwe_dimension,
                multibit.polynomial_size,
                multibit.input_lwe_dimension,
                ks_level,
                ks_base_log,
                multibit.decomp_level_count,
                multibit.decomp_base_log,
                multibit.grouping_factor,
                PBSType::MultiBit,
                None,
            )
        }
    }
}
/// Expands a 128-bit encrypted AES key into 11 round keys of 128 encrypted bits each.
///
/// A trivial zero ciphertext of `11 * 128` blocks is created for the round keys, the input
/// key is asserted to hold exactly 128 blocks, and the work is forwarded to the backend
/// wrapper matching the bootstrapping key kind.
///
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as
///   synchronization is required
pub unsafe fn key_expansion_async(
    &self,
    key: &CudaUnsignedRadixCiphertext,
    streams: &CudaStreams,
) -> CudaUnsignedRadixCiphertext {
    let round_key_count = 11;
    let bits_per_key = 128;

    let mut expanded_keys: CudaUnsignedRadixCiphertext =
        self.create_trivial_zero_radix(round_key_count * bits_per_key, streams);

    let key_block_count = key.as_ref().d_blocks.lwe_ciphertext_count().0;
    assert_eq!(
        key_block_count, bits_per_key,
        "Input key must contain {} encrypted bits, but contains {}",
        bits_per_key, key_block_count
    );

    // Values shared by both bootstrapping-key flavours.
    let ksk = &self.key_switching_key;
    let ks_level = ksk.decomposition_level_count();
    let ks_base_log = ksk.decomposition_base_log();

    match &self.bootstrapping_key {
        CudaBootstrappingKey::Classic(classic) => {
            unchecked_key_expansion_integer_radix_kb_assign_async(
                streams,
                expanded_keys.as_mut(),
                key.as_ref(),
                &classic.d_vec,
                &ksk.d_vec,
                self.message_modulus,
                self.carry_modulus,
                classic.glwe_dimension,
                classic.polynomial_size,
                classic.input_lwe_dimension,
                ks_level,
                ks_base_log,
                classic.decomp_level_count,
                classic.decomp_base_log,
                LweBskGroupingFactor(0),
                PBSType::Classical,
                classic.ms_noise_reduction_configuration.as_ref(),
            );
        }
        CudaBootstrappingKey::MultiBit(multibit) => {
            unchecked_key_expansion_integer_radix_kb_assign_async(
                streams,
                expanded_keys.as_mut(),
                key.as_ref(),
                &multibit.d_vec,
                &ksk.d_vec,
                self.message_modulus,
                self.carry_modulus,
                multibit.glwe_dimension,
                multibit.polynomial_size,
                multibit.input_lwe_dimension,
                ks_level,
                ks_base_log,
                multibit.decomp_level_count,
                multibit.decomp_base_log,
                multibit.grouping_factor,
                PBSType::MultiBit,
                None,
            );
        }
    }

    expanded_keys
}
/// Synchronous variant of `get_key_expansion_size_on_gpu_async`: queries the size reported
/// for the key expansion, then synchronizes `streams` before returning it.
fn get_key_expansion_size_on_gpu(&self, streams: &CudaStreams) -> u64 {
    // SAFETY: the streams are synchronized right after the query.
    let reported_size = unsafe { self.get_key_expansion_size_on_gpu_async(streams) };
    streams.synchronize();
    reported_size
}
/// Forwards the key-expansion size query to the backend wrapper, filling in the
/// cryptographic parameters from this server key and its bootstrapping key flavour.
///
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as
///   synchronization is required
unsafe fn get_key_expansion_size_on_gpu_async(&self, streams: &CudaStreams) -> u64 {
    // Values shared by both bootstrapping-key flavours.
    let ks_level = self.key_switching_key.decomposition_level_count();
    let ks_base_log = self.key_switching_key.decomposition_base_log();

    match &self.bootstrapping_key {
        CudaBootstrappingKey::Classic(classic) => {
            get_key_expansion_integer_radix_size_on_gpu(
                streams,
                self.message_modulus,
                self.carry_modulus,
                classic.glwe_dimension,
                classic.polynomial_size,
                classic.input_lwe_dimension,
                ks_level,
                ks_base_log,
                classic.decomp_level_count,
                classic.decomp_base_log,
                LweBskGroupingFactor(0),
                PBSType::Classical,
                classic.ms_noise_reduction_configuration.as_ref(),
            )
        }
        CudaBootstrappingKey::MultiBit(multibit) => {
            get_key_expansion_integer_radix_size_on_gpu(
                streams,
                self.message_modulus,
                self.carry_modulus,
                multibit.glwe_dimension,
                multibit.polynomial_size,
                multibit.input_lwe_dimension,
                ks_level,
                ks_base_log,
                multibit.decomp_level_count,
                multibit.decomp_base_log,
                multibit.grouping_factor,
                PBSType::MultiBit,
                None,
            )
        }
    }
}
}

View File

@@ -57,6 +57,7 @@ mod sub;
mod vector_comparisons;
mod vector_find;
mod aes;
#[cfg(test)]
mod tests_long_run;
#[cfg(test)]

View File

@@ -1,4 +1,5 @@
pub(crate) mod test_add;
pub(crate) mod test_aes;
pub(crate) mod test_bitwise_op;
pub(crate) mod test_cmux;
pub(crate) mod test_comparison;
@@ -82,6 +83,98 @@ impl<F> GpuFunctionExecutor<F> {
}
}
/// For functions taking two ciphertexts, a `u128` and two `usize` values
/// (e.g. `CudaServerKey::aes_ctr_with_fixed_parallelism`).
impl<'a, F>
    FunctionExecutor<
        (&'a RadixCiphertext, &'a RadixCiphertext, u128, usize, usize),
        RadixCiphertext,
    > for GpuFunctionExecutor<F>
where
    F: Fn(
        &CudaServerKey,
        &CudaUnsignedRadixCiphertext,
        &CudaUnsignedRadixCiphertext,
        u128,
        usize,
        usize,
        &CudaStreams,
    ) -> CudaUnsignedRadixCiphertext,
{
    fn setup(&mut self, cks: &RadixClientKey, sks: Arc<ServerKey>) {
        self.setup_from_keys(cks, &sks);
    }

    fn execute(
        &mut self,
        input: (&'a RadixCiphertext, &'a RadixCiphertext, u128, usize, usize),
    ) -> RadixCiphertext {
        let (first_ct, second_ct, clear_u128, first_usize, second_usize) = input;
        let ctx = self
            .context
            .as_ref()
            .expect("setup was not properly called");

        // Upload both CPU ciphertexts to the GPU before running the wrapped function.
        let d_first = CudaUnsignedRadixCiphertext::from_radix_ciphertext(first_ct, &ctx.streams);
        let d_second = CudaUnsignedRadixCiphertext::from_radix_ciphertext(second_ct, &ctx.streams);

        (self.func)(
            &ctx.sks,
            &d_first,
            &d_second,
            clear_u128,
            first_usize,
            second_usize,
            &ctx.streams,
        )
        .to_radix_ciphertext(&ctx.streams)
    }
}
/// For functions taking two ciphertexts, a `u128` and one `usize` value
/// (e.g. `CudaServerKey::aes_ctr`).
impl<'a, F>
    FunctionExecutor<(&'a RadixCiphertext, &'a RadixCiphertext, u128, usize), RadixCiphertext>
    for GpuFunctionExecutor<F>
where
    F: Fn(
        &CudaServerKey,
        &CudaUnsignedRadixCiphertext,
        &CudaUnsignedRadixCiphertext,
        u128,
        usize,
        &CudaStreams,
    ) -> CudaUnsignedRadixCiphertext,
{
    fn setup(&mut self, cks: &RadixClientKey, sks: Arc<ServerKey>) {
        self.setup_from_keys(cks, &sks);
    }

    fn execute(
        &mut self,
        input: (&'a RadixCiphertext, &'a RadixCiphertext, u128, usize),
    ) -> RadixCiphertext {
        let (first_ct, second_ct, clear_u128, clear_usize) = input;
        let ctx = self
            .context
            .as_ref()
            .expect("setup was not properly called");

        // Upload both CPU ciphertexts to the GPU before running the wrapped function.
        let d_first = CudaUnsignedRadixCiphertext::from_radix_ciphertext(first_ct, &ctx.streams);
        let d_second = CudaUnsignedRadixCiphertext::from_radix_ciphertext(second_ct, &ctx.streams);

        (self.func)(
            &ctx.sks,
            &d_first,
            &d_second,
            clear_u128,
            clear_usize,
            &ctx.streams,
        )
        .to_radix_ciphertext(&ctx.streams)
    }
}
/// For default/unchecked binary functions
impl<'a, F> FunctionExecutor<(&'a RadixCiphertext, &'a RadixCiphertext), RadixCiphertext>
for GpuFunctionExecutor<F>

View File

@@ -0,0 +1,59 @@
use crate::integer::gpu::server_key::radix::tests_unsigned::{
create_gpu_parameterized_test, GpuFunctionExecutor,
};
use crate::integer::gpu::CudaServerKey;
use crate::integer::server_key::radix_parallel::tests_cases_unsigned::{
aes_dynamic_parallelism_many_inputs_test, aes_fixed_parallelism_1_input_test,
aes_fixed_parallelism_2_inputs_test,
};
use crate::shortint::parameters::{
TestParameters, PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
// Register the three GPU AES tests defined below. Each one is listed with the Gaussian and
// the TUniform multi-bit (group 4, message 2, carry 2) parameter sets; presumably the macro
// generates one test per listed parameter set - see the macro definition to confirm.
create_gpu_parameterized_test!(integer_aes_fixed_parallelism_1_input {
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
});
create_gpu_parameterized_test!(integer_aes_fixed_parallelism_2_inputs {
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
});
create_gpu_parameterized_test!(integer_aes_dynamic_parallelism_many_inputs {
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M128,
PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128
});
// The following two tests are called "fixed_parallelism" because they exercise AES in CTR mode
// with every possible S-box parallelization. The S-box has 16 bytes to process; the
// parallelization is the number of bytes handled in parallel by one call: 1, 2, 4, 8, or 16.
//
fn integer_aes_fixed_parallelism_1_input<P: Into<TestParameters>>(param: P) {
    aes_fixed_parallelism_1_input_test(
        param,
        GpuFunctionExecutor::new(&CudaServerKey::aes_ctr_with_fixed_parallelism),
    );
}
// Same as the 1-input variant, but the shared test case is the one running two AES inputs.
fn integer_aes_fixed_parallelism_2_inputs<P: Into<TestParameters>>(param: P) {
    aes_fixed_parallelism_2_inputs_test(
        param,
        GpuFunctionExecutor::new(&CudaServerKey::aes_ctr_with_fixed_parallelism),
    );
}
// The "dynamic_parallelism" test lets `CudaServerKey::aes_ctr` look for the largest S-box
// parallelization that the machine can support.
//
fn integer_aes_dynamic_parallelism_many_inputs<P: Into<TestParameters>>(param: P) {
    aes_dynamic_parallelism_many_inputs_test(
        param,
        GpuFunctionExecutor::new(&CudaServerKey::aes_ctr),
    );
}

View File

@@ -43,6 +43,11 @@ pub(crate) use crate::integer::server_key::radix_parallel::tests_unsigned::test_
default_add_test, unchecked_add_assign_test,
};
#[cfg(feature = "gpu")]
pub(crate) use crate::integer::server_key::radix_parallel::tests_unsigned::test_aes::{
aes_dynamic_parallelism_many_inputs_test, aes_fixed_parallelism_1_input_test,
aes_fixed_parallelism_2_inputs_test,
};
#[cfg(feature = "gpu")]
pub(crate) use crate::integer::server_key::radix_parallel::tests_unsigned::test_neg::default_neg_test;
pub(crate) use crate::integer::server_key::radix_parallel::tests_unsigned::test_neg::unchecked_neg_test;
#[cfg(feature = "gpu")]

View File

@@ -1,5 +1,6 @@
mod modulus_switch_compression;
pub(crate) mod test_add;
pub(crate) mod test_aes;
pub(crate) mod test_bitwise_op;
mod test_block_rotate;
mod test_block_shift;

View File

@@ -0,0 +1,225 @@
#![cfg(feature = "gpu")]
use crate::integer::keycache::KEY_CACHE;
use crate::integer::server_key::radix_parallel::tests_cases_unsigned::FunctionExecutor;
use crate::integer::{IntegerKeyKind, RadixCiphertext, RadixClientKey};
use crate::shortint::parameters::TestParameters;
use std::sync::Arc;
/// AES forward substitution box (FIPS-197), indexed by the input byte.
///
/// Used by the plaintext reference implementation below (`sub_bytes` and the key schedule).
const S_BOX: [u8; 256] = [
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
];
/// Plaintext AES-128 key schedule: expands `key` into 11 round keys.
///
/// The key is read as four big-endian 32-bit words. Every fourth word of the 44-word
/// schedule is derived from the previous word rotated left by one byte, substituted through
/// `S_BOX` and XORed with the round constant. Each round key packs four consecutive words,
/// first word in the most-significant position.
fn plain_key_expansion(key: u128) -> Vec<u128> {
    const RCON: [u32; 10] = [
        0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000, 0x40000000,
        0x80000000, 0x1B000000, 0x36000000,
    ];

    // Applies the S-box to each of the four bytes of a word, keeping their positions.
    let sub_word =
        |word: u32| u32::from_be_bytes(word.to_be_bytes().map(|byte| S_BOX[usize::from(byte)]));

    let mut schedule = [0u32; 44];

    // The first four words are the key itself.
    let key_bytes = key.to_be_bytes();
    for (slot, quad) in schedule.iter_mut().zip(key_bytes.chunks_exact(4)) {
        *slot = u32::from_be_bytes([quad[0], quad[1], quad[2], quad[3]]);
    }

    for idx in 4..44 {
        let previous = schedule[idx - 1];
        let mixed = if idx % 4 == 0 {
            sub_word(previous.rotate_left(8)) ^ RCON[idx / 4 - 1]
        } else {
            previous
        };
        schedule[idx] = schedule[idx - 4] ^ mixed;
    }

    schedule
        .chunks_exact(4)
        .map(|quad| {
            quad.iter()
                .fold(0u128, |packed, &word| (packed << 32) | u128::from(word))
        })
        .collect()
}
/// SubBytes step: replaces every byte of the state with its `S_BOX` image, in place.
fn sub_bytes(state: &mut [u8; 16]) {
    state
        .iter_mut()
        .for_each(|cell| *cell = S_BOX[usize::from(*cell)]);
}
/// ShiftRows step on a state laid out as `state[row + 4 * column]`.
///
/// Row `r` is rotated left by `r` positions; row 0 is left untouched.
fn shift_rows(state: &mut [u8; 16]) {
    let snapshot = *state;
    for row in 1..4 {
        for column in 0..4 {
            let source_column = (column + row) % 4;
            state[row + 4 * column] = snapshot[row + 4 * source_column];
        }
    }
}
/// Multiplies `a` by `b` in GF(2^8) with the AES reduction constant `0x1B`.
///
/// Shift-and-add over the 8 bits of `b`: `a` is conditionally accumulated, then doubled
/// (with reduction whenever its top bit was set) at every step.
fn gmul(mut a: u8, mut b: u8) -> u8 {
    let mut product: u8 = 0;
    for _ in 0..8 {
        // All-ones mask when the low bit of `b` is set, zero otherwise.
        product ^= a & 0u8.wrapping_sub(b & 1);
        // All-ones mask when the top bit of `a` is about to be shifted out.
        let reduce_mask = 0u8.wrapping_sub(a >> 7);
        a = (a << 1) ^ (0x1B & reduce_mask);
        b >>= 1;
    }
    product
}
/// MixColumns step: each group of four consecutive bytes is treated as one column and
/// multiplied by the fixed AES matrix (rows `2 3 1 1`, `1 2 3 1`, `1 1 2 3`, `3 1 1 2`).
fn mix_columns(state: &mut [u8; 16]) {
    for column in state.chunks_exact_mut(4) {
        // Copy the inputs first: every output byte depends on all four of them.
        let (a0, a1, a2, a3) = (column[0], column[1], column[2], column[3]);
        column[0] = gmul(a0, 2) ^ gmul(a1, 3) ^ a2 ^ a3;
        column[1] = a0 ^ gmul(a1, 2) ^ gmul(a2, 3) ^ a3;
        column[2] = a0 ^ a1 ^ gmul(a2, 2) ^ gmul(a3, 3);
        column[3] = gmul(a0, 3) ^ a1 ^ a2 ^ gmul(a3, 2);
    }
}
/// AddRoundKey step: XORs the state with the big-endian bytes of `round_key`, in place.
fn add_round_key(state: &mut [u8; 16], round_key: u128) {
    state
        .iter_mut()
        .zip(round_key.to_be_bytes())
        .for_each(|(cell, key_byte)| *cell ^= key_byte);
}
/// Plaintext AES-128 encryption of one 16-byte block, in place.
///
/// `expanded_keys` is expected to hold the 11 round keys: index 0 is the initial whitening
/// key, indices 1 to 9 drive the full rounds, and index 10 the final round, which has no
/// MixColumns step.
fn plain_aes_encrypt_block(block_bytes: &mut [u8; 16], expanded_keys: &[u128]) {
    add_round_key(block_bytes, expanded_keys[0]);

    // Nine full rounds, using round keys 1 through 9.
    for &round_key in expanded_keys.iter().skip(1).take(9) {
        sub_bytes(block_bytes);
        shift_rows(block_bytes);
        mix_columns(block_bytes);
        add_round_key(block_bytes, round_key);
    }

    // Final round: same as above, minus MixColumns.
    sub_bytes(block_bytes);
    shift_rows(block_bytes);
    add_round_key(block_bytes, expanded_keys[10]);
}
/// Plaintext reference for the CTR keystream: encrypts `iv`, `iv + 1`, ...,
/// `iv + num_aes_inputs - 1` (wrapping) under `key` and returns one `u128` per block.
fn plain_aes_ctr(num_aes_inputs: usize, iv: u128, key: u128) -> Vec<u128> {
    let round_keys = plain_key_expansion(key);
    (0..num_aes_inputs)
        .map(|offset| {
            let mut counter_block = iv.wrapping_add(offset as u128).to_be_bytes();
            plain_aes_encrypt_block(&mut counter_block, &round_keys);
            u128::from_be_bytes(counter_block)
        })
        .collect()
}
/// Shared body of the "fixed parallelism" AES tests.
///
/// Encrypts a fixed key and IV bit by bit, then, for each S-box parallelism in
/// `[1, 2, 4, 8, 16]`, runs the executor with a start counter of 0 and checks the decrypted
/// output against the plaintext reference `plain_aes_ctr`.
fn internal_aes_fixed_parallelism_test<P, E>(param: P, mut executor: E, num_aes_inputs: usize)
where
    P: Into<TestParameters>,
    E: for<'a> FunctionExecutor<
        (&'a RadixCiphertext, &'a RadixCiphertext, u128, usize, usize),
        RadixCiphertext,
    >,
{
    let (client_key, server_key) =
        KEY_CACHE.get_from_params(param.into(), IntegerKeyKind::Radix);
    // One block per encrypted value: every AES bit lives in its own block.
    let cks = RadixClientKey::from((client_key, 1));
    executor.setup(&cks, Arc::new(server_key));

    let key: u128 = 0x2b7e151628aed2a6abf7158809cf4f3c;
    let iv: u128 = 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff;

    let expected = plain_aes_ctr(num_aes_inputs, iv, key);

    let encrypted_key = cks.encrypt_u128_for_aes_ctr(key);
    let encrypted_iv = cks.encrypt_u128_for_aes_ctr(iv);

    for sbox_parallelism in [1, 2, 4, 8, 16] {
        let encrypted_output = executor.execute((
            &encrypted_key,
            &encrypted_iv,
            0,
            num_aes_inputs,
            sbox_parallelism,
        ));
        let decrypted = cks.decrypt_u128_from_aes_ctr(&encrypted_output, num_aes_inputs);
        assert_eq!(decrypted, expected);
    }
}
/// Runs the fixed-parallelism AES test case with a single AES input.
///
/// Thin wrapper around `internal_aes_fixed_parallelism_test` with `num_aes_inputs = 1`.
pub fn aes_fixed_parallelism_1_input_test<P, E>(param: P, executor: E)
where
P: Into<TestParameters>,
E: for<'a> FunctionExecutor<
(&'a RadixCiphertext, &'a RadixCiphertext, u128, usize, usize),
RadixCiphertext,
>,
{
internal_aes_fixed_parallelism_test(param, executor, 1);
}
/// Runs the fixed-parallelism AES test case with two AES inputs.
///
/// Thin wrapper around `internal_aes_fixed_parallelism_test` with `num_aes_inputs = 2`.
pub fn aes_fixed_parallelism_2_inputs_test<P, E>(param: P, executor: E)
where
P: Into<TestParameters>,
E: for<'a> FunctionExecutor<
(&'a RadixCiphertext, &'a RadixCiphertext, u128, usize, usize),
RadixCiphertext,
>,
{
internal_aes_fixed_parallelism_test(param, executor, 2);
}
/// "Dynamic parallelism" AES test case.
///
/// Encrypts a fixed key and IV bit by bit, then, for 4, 8, 16 and 32 AES inputs, runs the
/// executor with a start counter of 0 and checks the decrypted output against the plaintext
/// reference `plain_aes_ctr`. The S-box parallelism is left to the executed function.
pub fn aes_dynamic_parallelism_many_inputs_test<P, E>(param: P, mut executor: E)
where
    P: Into<TestParameters>,
    E: for<'a> FunctionExecutor<
        (&'a RadixCiphertext, &'a RadixCiphertext, u128, usize),
        RadixCiphertext,
    >,
{
    let (client_key, server_key) =
        KEY_CACHE.get_from_params(param.into(), IntegerKeyKind::Radix);
    // One block per encrypted value: every AES bit lives in its own block.
    let cks = RadixClientKey::from((client_key, 1));
    executor.setup(&cks, Arc::new(server_key));

    let key: u128 = 0x2b7e151628aed2a6abf7158809cf4f3c;
    let iv: u128 = 0xf0f1f2f3f4f5f6f7f8f9fafbfcfdfeff;

    let encrypted_key = cks.encrypt_u128_for_aes_ctr(key);
    let encrypted_iv = cks.encrypt_u128_for_aes_ctr(iv);

    for num_aes_inputs in [4, 8, 16, 32] {
        let expected = plain_aes_ctr(num_aes_inputs, iv, key);
        let encrypted_output =
            executor.execute((&encrypted_key, &encrypted_iv, 0, num_aes_inputs));
        let decrypted = cks.decrypt_u128_from_aes_ctr(&encrypted_output, num_aes_inputs);
        assert_eq!(decrypted, expected);
    }
}