mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-13 08:38:03 -05:00
Compare commits
1 Commits
pa/paralle
...
al/vectori
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
18084fb995 |
@@ -56,6 +56,7 @@ typedef struct {
|
||||
uint32_t num_radix_blocks;
|
||||
uint32_t max_num_radix_blocks;
|
||||
uint32_t lwe_dimension;
|
||||
uint32_t num_radix_ciphertexts;
|
||||
} CudaRadixCiphertextFFI;
|
||||
|
||||
typedef struct {
|
||||
|
||||
@@ -24,33 +24,43 @@ __host__ void host_integer_radix_bitop_kb(
|
||||
lwe_array_out->num_radix_blocks == lwe_array_2->num_radix_blocks,
|
||||
"Cuda error: input and output num radix blocks must be equal");
|
||||
|
||||
PANIC_IF_FALSE(
|
||||
lwe_array_out->num_radix_ciphertexts ==
|
||||
lwe_array_1->num_radix_ciphertexts &&
|
||||
lwe_array_out->num_radix_ciphertexts ==
|
||||
lwe_array_2->num_radix_ciphertexts,
|
||||
"Cuda error: input and output num radix ciphertexts must be equal");
|
||||
|
||||
PANIC_IF_FALSE(lwe_array_out->lwe_dimension == lwe_array_1->lwe_dimension &&
|
||||
lwe_array_out->lwe_dimension == lwe_array_2->lwe_dimension,
|
||||
"Cuda error: input and output lwe dimension must be equal");
|
||||
|
||||
auto lut = mem_ptr->lut;
|
||||
uint64_t degrees[lwe_array_1->num_radix_blocks];
|
||||
uint64_t degrees[lwe_array_1->num_radix_blocks *
|
||||
lwe_array_1->num_radix_ciphertexts];
|
||||
if (mem_ptr->op == BITOP_TYPE::BITAND) {
|
||||
update_degrees_after_bitand(degrees, lwe_array_1->degrees,
|
||||
lwe_array_2->degrees,
|
||||
lwe_array_1->num_radix_blocks);
|
||||
update_degrees_after_bitand(
|
||||
degrees, lwe_array_1->degrees, lwe_array_2->degrees,
|
||||
lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts);
|
||||
} else if (mem_ptr->op == BITOP_TYPE::BITOR) {
|
||||
update_degrees_after_bitor(degrees, lwe_array_1->degrees,
|
||||
lwe_array_2->degrees,
|
||||
lwe_array_1->num_radix_blocks);
|
||||
update_degrees_after_bitor(
|
||||
degrees, lwe_array_1->degrees, lwe_array_2->degrees,
|
||||
lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts);
|
||||
} else if (mem_ptr->op == BITOP_TYPE::BITXOR) {
|
||||
update_degrees_after_bitxor(degrees, lwe_array_1->degrees,
|
||||
lwe_array_2->degrees,
|
||||
lwe_array_1->num_radix_blocks);
|
||||
update_degrees_after_bitxor(
|
||||
degrees, lwe_array_1->degrees, lwe_array_2->degrees,
|
||||
lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts);
|
||||
}
|
||||
|
||||
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
|
||||
streams, lwe_array_out, lwe_array_1, lwe_array_2, bsks, ksks,
|
||||
ms_noise_reduction_key, lut, lwe_array_out->num_radix_blocks,
|
||||
ms_noise_reduction_key, lut,
|
||||
lwe_array_out->num_radix_blocks * lwe_array_out->num_radix_ciphertexts,
|
||||
lut->params.message_modulus);
|
||||
|
||||
memcpy(lwe_array_out->degrees, degrees,
|
||||
lwe_array_out->num_radix_blocks * sizeof(uint64_t));
|
||||
lwe_array_out->num_radix_blocks *
|
||||
lwe_array_out->num_radix_ciphertexts * sizeof(uint64_t));
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
|
||||
@@ -417,9 +417,12 @@ __host__ void host_pack_bivariate_blocks(
|
||||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
|
||||
PANIC("Cuda error: input and output radix ciphertexts should have the same "
|
||||
"lwe dimension")
|
||||
if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
|
||||
num_radix_blocks > lwe_array_1->num_radix_blocks ||
|
||||
num_radix_blocks > lwe_array_2->num_radix_blocks)
|
||||
if (num_radix_blocks > lwe_array_out->num_radix_blocks *
|
||||
lwe_array_out->num_radix_ciphertexts ||
|
||||
num_radix_blocks >
|
||||
lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts ||
|
||||
num_radix_blocks >
|
||||
lwe_array_2->num_radix_blocks * lwe_array_2->num_radix_ciphertexts)
|
||||
PANIC("Cuda error: num radix blocks on which packing is applied should be "
|
||||
"smaller or equal to the number of input & output radix blocks")
|
||||
|
||||
@@ -530,7 +533,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
if (num_radix_blocks > lut->num_blocks)
|
||||
PANIC("Cuda error: num radix blocks on which lut is applied should be "
|
||||
"smaller or equal to the number of lut radix blocks")
|
||||
if (num_radix_blocks > lwe_array_out->num_radix_blocks)
|
||||
if (num_radix_blocks >
|
||||
lwe_array_out->num_radix_blocks * lwe_array_out->num_radix_ciphertexts)
|
||||
PANIC("Cuda error: num radix blocks on which lut is applied should be "
|
||||
"smaller or equal to the number of input & output radix blocks")
|
||||
|
||||
@@ -756,11 +760,14 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
if (num_radix_blocks > lut->num_blocks)
|
||||
PANIC("Cuda error: num radix blocks on which lut is applied should be "
|
||||
"smaller or equal to the number of lut radix blocks")
|
||||
if (num_radix_blocks > lwe_array_out->num_radix_blocks ||
|
||||
num_radix_blocks > lwe_array_1->num_radix_blocks ||
|
||||
num_radix_blocks > lwe_array_2->num_radix_blocks)
|
||||
if (num_radix_blocks > lwe_array_out->num_radix_blocks *
|
||||
lwe_array_out->num_radix_ciphertexts ||
|
||||
num_radix_blocks >
|
||||
lwe_array_1->num_radix_blocks * lwe_array_1->num_radix_ciphertexts ||
|
||||
num_radix_blocks >
|
||||
lwe_array_2->num_radix_blocks * lwe_array_2->num_radix_ciphertexts)
|
||||
PANIC("Cuda error: num radix blocks on which lut is applied should be "
|
||||
"smaller or equal to the number of input & output radix blocks")
|
||||
"smaller or equal to the number of total input & output radix blocks")
|
||||
|
||||
auto params = lut->params;
|
||||
auto pbs_type = params.pbs_type;
|
||||
|
||||
@@ -25,6 +25,7 @@ void into_radix_ciphertext(CudaRadixCiphertextFFI *radix, void *lwe_array,
|
||||
radix->num_radix_blocks = num_radix_blocks;
|
||||
radix->max_num_radix_blocks = num_radix_blocks;
|
||||
radix->ptr = lwe_array;
|
||||
radix->num_radix_ciphertexts = 1;
|
||||
|
||||
radix->degrees = (uint64_t *)(calloc(num_radix_blocks, sizeof(uint64_t)));
|
||||
radix->noise_levels =
|
||||
|
||||
@@ -19,6 +19,7 @@ void create_zero_radix_ciphertext_async(cudaStream_t const stream,
|
||||
radix->lwe_dimension = lwe_dimension;
|
||||
radix->num_radix_blocks = num_radix_blocks;
|
||||
radix->max_num_radix_blocks = num_radix_blocks;
|
||||
radix->num_radix_ciphertexts = 1;
|
||||
uint64_t size = (lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
|
||||
radix->ptr = (void *)cuda_malloc_with_size_tracking_async(
|
||||
size, stream, gpu_index, size_tracker, allocate_gpu_memory);
|
||||
@@ -63,6 +64,7 @@ void as_radix_ciphertext_slice(CudaRadixCiphertextFFI *output_radix,
|
||||
|
||||
auto lwe_size = input_radix->lwe_dimension + 1;
|
||||
output_radix->num_radix_blocks = end_input_lwe_index - start_input_lwe_index;
|
||||
output_radix->num_radix_ciphertexts = input_radix->num_radix_ciphertexts;
|
||||
output_radix->max_num_radix_blocks = input_radix->max_num_radix_blocks;
|
||||
output_radix->lwe_dimension = input_radix->lwe_dimension;
|
||||
Torus *in_ptr = (Torus *)input_radix->ptr;
|
||||
|
||||
@@ -183,6 +183,7 @@ pub struct CudaRadixCiphertextFFI {
|
||||
pub num_radix_blocks: u32,
|
||||
pub max_num_radix_blocks: u32,
|
||||
pub lwe_dimension: u32,
|
||||
pub num_radix_ciphertexts: u32,
|
||||
}
|
||||
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
|
||||
const _: () = {
|
||||
@@ -201,6 +202,8 @@ const _: () = {
|
||||
[::std::mem::offset_of!(CudaRadixCiphertextFFI, max_num_radix_blocks) - 28usize];
|
||||
["Offset of field: CudaRadixCiphertextFFI::lwe_dimension"]
|
||||
[::std::mem::offset_of!(CudaRadixCiphertextFFI, lwe_dimension) - 32usize];
|
||||
["Offset of field: CudaRadixCiphertextFFI::num_radix_ciphertexts"]
|
||||
[::std::mem::offset_of!(CudaRadixCiphertextFFI, num_radix_ciphertexts) - 36usize];
|
||||
};
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
|
||||
@@ -90,6 +90,12 @@ path = "benches/high_level_api/noise_squash.rs"
|
||||
harness = false
|
||||
required-features = ["integer", "internal-keycache"]
|
||||
|
||||
[[bench]]
|
||||
name = "hlapi-arrays"
|
||||
path = "benches/high_level_api/arrays.rs"
|
||||
harness = false
|
||||
required-features = ["integer", "internal-keycache"]
|
||||
|
||||
[[bench]]
|
||||
name = "glwe_packing_compression-integer-bench"
|
||||
path = "benches/integer/glwe_packing_compression.rs"
|
||||
|
||||
63
tfhe-benchmark/benches/high_level_api/arrays.rs
Normal file
63
tfhe-benchmark/benches/high_level_api/arrays.rs
Normal file
@@ -0,0 +1,63 @@
|
||||
use benchmark::params_aliases::BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||
use benchmark::utilities::{write_to_json, OperatorType};
|
||||
use criterion::Criterion;
|
||||
use rand::prelude::*;
|
||||
use tfhe::array::GpuFheUint64Array;
|
||||
use tfhe::keycache::NamedParam;
|
||||
use tfhe::prelude::*;
|
||||
use tfhe::{ClientKey, CompressedServerKey};
|
||||
|
||||
/// Criterion benchmark of the element-wise bitwise AND of two encrypted 32x32 arrays of `u64`
/// using the GPU high-level array API.
///
/// Keys are generated from
/// `BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128`, the server key is
/// decompressed to the GPU and installed with `set_server_key`, both operands are encrypted from
/// random clear values, and the benchmark metadata is recorded with `write_to_json`.
///
/// NOTE(review): this entry point only exists when the `gpu` feature is enabled — confirm the
/// bench target cannot be built without it.
#[cfg(feature = "gpu")]
fn main() {
    let cks = {
        use tfhe::{set_server_key, ConfigBuilder};
        let config = ConfigBuilder::with_custom_parameters(
            BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
        )
        .build();
        let cks = ClientKey::generate(config);
        let compressed_sks = CompressedServerKey::new(&cks);

        set_server_key(compressed_sks.decompress_to_gpu());
        cks
    };

    // Both operands are square `array_dim` x `array_dim` arrays.
    let array_dim = 32;
    let num_elems = array_dim * array_dim;
    let mut rng = thread_rng();
    let clear_xs = (0..num_elems as u64)
        .map(|_| rng.gen::<u64>())
        .collect::<Vec<_>>();
    let clear_ys = (0..num_elems as u64)
        .map(|_| rng.gen::<u64>())
        .collect::<Vec<_>>();

    let xs =
        GpuFheUint64Array::try_encrypt((clear_xs.as_slice(), vec![array_dim, array_dim]), &cks)
            .unwrap();
    let ys =
        GpuFheUint64Array::try_encrypt((clear_ys.as_slice(), vec![array_dim, array_dim]), &cks)
            .unwrap();

    let mut c = Criterion::default().configure_from_args();
    // A plain literal: `format!` without arguments only adds an allocation.
    let bench_id = "bench::hlapi::array::cuda::bitand::";
    c.bench_function(bench_id, |b| {
        // Returning the result hands it to criterion instead of discarding it in the closure.
        b.iter(|| &xs & &ys)
    });

    let params = cks.computation_parameters();

    write_to_json::<u64, _>(
        bench_id,
        params,
        params.name(),
        // Label this record after the operation that is actually benchmarked.
        "bitand",
        &OperatorType::Atomic,
        64,
        vec![],
    );

    c.final_summary();
}
|
||||
@@ -104,12 +104,19 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
|
||||
.map(|list| list.0.lwe_ciphertext_count.0)
|
||||
.sum(),
|
||||
);
|
||||
|
||||
assert_ne!(
|
||||
lwe_ciphertext_count.0, 0,
|
||||
"Empty iterator of CudaLweCiphertextList"
|
||||
);
|
||||
|
||||
let stream_count = lwe_ciphertext_count.0.min(6);
|
||||
let mut new_streams: Vec<CudaStreams> = Vec::with_capacity(stream_count);
|
||||
|
||||
for _ in 0..stream_count {
|
||||
let stream = CudaStreams::new_single_gpu(streams.gpu_indexes[0]);
|
||||
new_streams.push(stream);
|
||||
}
|
||||
|
||||
let first_item = cuda_ciphertexts_list_vec.next().unwrap();
|
||||
let lwe_dimension = first_item.lwe_dimension();
|
||||
let mut d_vec = CudaVec::new(
|
||||
@@ -123,25 +130,20 @@ impl<T: UnsignedInteger> CudaLweCiphertextList<T> {
|
||||
* std::mem::size_of::<T>();
|
||||
// Concatenate gpu_index memory
|
||||
unsafe {
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
ptr,
|
||||
first_item.0.d_vec.as_c_ptr(0),
|
||||
size as u64,
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
);
|
||||
ptr = ptr.wrapping_byte_add(size);
|
||||
for list in cuda_ciphertexts_list_vec {
|
||||
for (i, list) in cuda_ciphertexts_list_vec.enumerate() {
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
ptr,
|
||||
list.0.d_vec.as_c_ptr(0),
|
||||
size as u64,
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
new_streams[i % stream_count].ptr[0],
|
||||
new_streams[i % stream_count].gpu_indexes[0].get(),
|
||||
);
|
||||
ptr = ptr.wrapping_byte_add(size);
|
||||
}
|
||||
}
|
||||
for s in new_streams.iter() {
|
||||
s.synchronize();
|
||||
}
|
||||
|
||||
let cuda_lwe_list = CudaLweList {
|
||||
d_vec,
|
||||
|
||||
@@ -840,6 +840,7 @@ pub unsafe fn add_lwe_ciphertext_vector_async<T: UnsignedInteger>(
|
||||
num_radix_blocks: num_samples,
|
||||
max_num_radix_blocks: num_samples,
|
||||
lwe_dimension: lwe_dimension.0 as u32,
|
||||
num_radix_ciphertexts: 1u32,
|
||||
};
|
||||
let lwe_array_in_1_data = CudaRadixCiphertextFFI {
|
||||
ptr: lwe_array_in_1.get_mut_c_ptr(0),
|
||||
@@ -848,6 +849,7 @@ pub unsafe fn add_lwe_ciphertext_vector_async<T: UnsignedInteger>(
|
||||
num_radix_blocks: num_samples,
|
||||
max_num_radix_blocks: num_samples,
|
||||
lwe_dimension: lwe_dimension.0 as u32,
|
||||
num_radix_ciphertexts: 1u32,
|
||||
};
|
||||
let lwe_array_in_2_data = CudaRadixCiphertextFFI {
|
||||
ptr: lwe_array_in_2.get_mut_c_ptr(0),
|
||||
@@ -856,6 +858,7 @@ pub unsafe fn add_lwe_ciphertext_vector_async<T: UnsignedInteger>(
|
||||
num_radix_blocks: num_samples,
|
||||
max_num_radix_blocks: num_samples,
|
||||
lwe_dimension: lwe_dimension.0 as u32,
|
||||
num_radix_ciphertexts: 1u32,
|
||||
};
|
||||
cuda_add_lwe_ciphertext_vector_64(
|
||||
streams.ptr[0],
|
||||
@@ -890,6 +893,7 @@ pub unsafe fn add_lwe_ciphertext_vector_assign_async<T: UnsignedInteger>(
|
||||
num_radix_blocks: num_samples,
|
||||
max_num_radix_blocks: num_samples,
|
||||
lwe_dimension: lwe_dimension.0 as u32,
|
||||
num_radix_ciphertexts: 1u32,
|
||||
};
|
||||
let lwe_array_in_data = CudaRadixCiphertextFFI {
|
||||
ptr: lwe_array_in.get_mut_c_ptr(0),
|
||||
@@ -898,6 +902,7 @@ pub unsafe fn add_lwe_ciphertext_vector_assign_async<T: UnsignedInteger>(
|
||||
num_radix_blocks: num_samples,
|
||||
max_num_radix_blocks: num_samples,
|
||||
lwe_dimension: lwe_dimension.0 as u32,
|
||||
num_radix_ciphertexts: 1u32,
|
||||
};
|
||||
cuda_add_lwe_ciphertext_vector_64(
|
||||
streams.ptr[0],
|
||||
|
||||
@@ -19,7 +19,8 @@ use crate::integer::block_decomposition::{
|
||||
DecomposableInto, RecomposableFrom, RecomposableSignedInteger,
|
||||
};
|
||||
use crate::integer::gpu::ciphertext::{
|
||||
CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
|
||||
CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext,
|
||||
CudaUnsignedRadixCiphertext,
|
||||
};
|
||||
use crate::integer::server_key::radix_parallel::scalar_div_mod::SignedReciprocable;
|
||||
use crate::integer::server_key::{Reciprocable, ScalarMultiplier};
|
||||
@@ -83,6 +84,12 @@ impl<'a, T> TensorSlice<'a, GpuSlice<'a, T>> {
|
||||
pub fn par_iter(self) -> ParStridedIter<'a, T> {
|
||||
ParStridedIter::new(self.slice.0, self.dims.clone())
|
||||
}
|
||||
/// Returns `self.dims.flattened_len()` for this slice.
// NOTE(review): presumably the total number of elements across all dimensions — confirm.
pub fn len(&self) -> usize {
|
||||
self.dims.flattened_len()
|
||||
}
|
||||
/// Returns the element slice held inside the wrapped `GpuSlice` (`self.slice.0`), borrowed for
/// the full lifetime `'a`.
pub fn as_slice(&self) -> &'a [T] {
|
||||
self.slice.0
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> TensorSlice<'a, GpuSliceMut<'a, T>> {
|
||||
@@ -316,7 +323,25 @@ where
|
||||
lhs: TensorSlice<'_, Self::Slice<'a>>,
|
||||
rhs: TensorSlice<'_, Self::Slice<'a>>,
|
||||
) -> Self::Owned {
|
||||
par_map_sks_op_on_pair_of_elements(lhs, rhs, crate::integer::gpu::CudaServerKey::bitand)
|
||||
GpuOwned(global_state::with_cuda_internal_keys(|cuda_key| {
|
||||
let streams = &cuda_key.streams;
|
||||
let num_ciphertexts = lhs.len() as u32;
|
||||
let lhs_slice: &[T] = lhs.as_slice();
|
||||
let rhs_slice: &[T] = rhs.as_slice();
|
||||
let mut lhs_aligned = T::from(CudaRadixCiphertext::from_radix_ciphertext_vec(
|
||||
lhs_slice, streams,
|
||||
));
|
||||
let rhs_aligned = T::from(CudaRadixCiphertext::from_radix_ciphertext_vec(
|
||||
rhs_slice, streams,
|
||||
));
|
||||
crate::integer::gpu::CudaServerKey::bitand_vec(
|
||||
cuda_key.pbs_key(),
|
||||
&mut lhs_aligned,
|
||||
&rhs_aligned,
|
||||
num_ciphertexts,
|
||||
streams,
|
||||
)
|
||||
}))
|
||||
}
|
||||
|
||||
fn bitor<'a>(
|
||||
|
||||
@@ -28,6 +28,12 @@ impl<'a, T> TensorSlice<'a, &'a [T]> {
|
||||
pub fn par_iter(self) -> ParStridedIter<'a, T> {
|
||||
ParStridedIter::new(self.slice, self.dims.clone())
|
||||
}
|
||||
/// Returns `self.dims.flattened_len()` for this slice.
// NOTE(review): presumably the total number of elements across all dimensions — confirm.
pub fn len(&self) -> usize {
|
||||
self.dims.flattened_len()
|
||||
}
|
||||
/// Returns the element slice held by this tensor slice (`self.slice`), borrowed for the full
/// lifetime `'a`.
pub fn as_slice(&self) -> &'a [T] {
|
||||
self.slice
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> TensorSlice<'a, &'a mut [T]> {
|
||||
|
||||
@@ -7,7 +7,7 @@ pub mod squashed_noise;
|
||||
|
||||
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
|
||||
use crate::core_crypto::gpu::vec::CudaVec;
|
||||
use crate::core_crypto::gpu::CudaStreams;
|
||||
use crate::core_crypto::gpu::{CudaLweList, CudaStreams};
|
||||
use crate::core_crypto::prelude::{LweCiphertextList, LweCiphertextOwned};
|
||||
use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo};
|
||||
use crate::integer::parameters::LweDimension;
|
||||
@@ -15,6 +15,7 @@ use crate::integer::{IntegerCiphertext, RadixCiphertext, SignedRadixCiphertext};
|
||||
use crate::shortint::{Ciphertext, EncryptionKeyChoice};
|
||||
use crate::GpuIndex;
|
||||
|
||||
use crate::shortint::parameters::LweCiphertextCount;
|
||||
pub use compressed_noise_squashed_ciphertext_list::*;
|
||||
|
||||
pub trait CudaIntegerRadixCiphertext: Sized {
|
||||
@@ -70,8 +71,68 @@ pub trait CudaIntegerRadixCiphertext: Sized {
|
||||
fn gpu_indexes(&self) -> &[GpuIndex] {
|
||||
&self.as_ref().d_blocks.0.d_vec.gpu_indexes
|
||||
}
|
||||
|
||||
// Converts a CudaIntegerRadixCiphertext with num_blocks * num_ciphertexts LWEs into a
|
||||
// Vec<CudaIntegerRadixCiphertext> of length num_radix_ciphertexts, where each ciphertext has
|
||||
// num_blocks LWEs
|
||||
fn to_integer_radix_ciphertext_vec(
|
||||
&self,
|
||||
num_radix_ciphertexts: u32,
|
||||
streams: &CudaStreams,
|
||||
) -> Vec<Self> {
|
||||
let total_blocks = self.as_ref().d_blocks.0.lwe_ciphertext_count.0;
|
||||
assert_eq!(total_blocks % num_radix_ciphertexts as usize, 0, "Total number of blocks ({total_blocks}) is not divisible by number of radix ciphertexts ({num_radix_ciphertexts})");
|
||||
|
||||
let num_blocks = total_blocks / num_radix_ciphertexts as usize;
|
||||
|
||||
let mut result = Vec::with_capacity(num_radix_ciphertexts as usize);
|
||||
let lwe_dimension = self.as_ref().d_blocks.lwe_dimension();
|
||||
|
||||
for i in 0..num_radix_ciphertexts as usize {
|
||||
let block_start = i * num_blocks;
|
||||
let block_end = block_start + num_blocks;
|
||||
|
||||
let d_vec = unsafe {
|
||||
let mut d_vec =
|
||||
CudaVec::new_async(lwe_dimension.to_lwe_size().0 * num_blocks, streams, 0);
|
||||
|
||||
let copy_start = block_start * lwe_dimension.to_lwe_size().0;
|
||||
let copy_end = block_end * lwe_dimension.to_lwe_size().0;
|
||||
d_vec.copy_src_range_gpu_to_gpu_async(
|
||||
copy_start..copy_end,
|
||||
&self.as_ref().d_blocks.0.d_vec,
|
||||
streams,
|
||||
0,
|
||||
);
|
||||
|
||||
streams.synchronize();
|
||||
d_vec
|
||||
};
|
||||
let lwe_list = CudaLweList::<u64> {
|
||||
d_vec,
|
||||
lwe_ciphertext_count: LweCiphertextCount(num_blocks),
|
||||
lwe_dimension,
|
||||
ciphertext_modulus: self.as_ref().d_blocks.ciphertext_modulus(),
|
||||
};
|
||||
|
||||
// Copy the associated block metadata
|
||||
let block_info = self.as_ref().info.blocks[block_start..block_end].to_vec();
|
||||
|
||||
let info = CudaRadixCiphertextInfo { blocks: block_info };
|
||||
|
||||
result.push(Self::from(CudaRadixCiphertext::new(
|
||||
CudaLweCiphertextList(lwe_list),
|
||||
info,
|
||||
)));
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
}
|
||||
|
||||
/// This struct corresponds to the pointers on GPU and
|
||||
/// metadata representing an array of LWEs corresponding
|
||||
/// to one or more RadixCiphertexts
|
||||
pub struct CudaRadixCiphertext {
|
||||
pub d_blocks: CudaLweCiphertextList<u64>,
|
||||
pub info: CudaRadixCiphertextInfo,
|
||||
|
||||
@@ -183,6 +183,25 @@ fn prepare_cuda_radix_ffi(
|
||||
num_radix_blocks: input.d_blocks.0.lwe_ciphertext_count.0 as u32,
|
||||
max_num_radix_blocks: input.d_blocks.0.lwe_ciphertext_count.0 as u32,
|
||||
lwe_dimension: input.d_blocks.0.lwe_dimension.0 as u32,
|
||||
num_radix_ciphertexts: 1u32,
|
||||
}
|
||||
}
|
||||
|
||||
fn prepare_cuda_radix_vec_ffi(
|
||||
input: &CudaRadixCiphertext,
|
||||
degrees_vec: &mut Vec<u64>,
|
||||
noise_levels_vec: &mut Vec<u64>,
|
||||
num_radix_ciphertexts: u32,
|
||||
) -> CudaRadixCiphertextFFI {
|
||||
CudaRadixCiphertextFFI {
|
||||
ptr: input.d_blocks.0.d_vec.get_mut_c_ptr(0),
|
||||
degrees: degrees_vec.as_mut_ptr(),
|
||||
noise_levels: noise_levels_vec.as_mut_ptr(),
|
||||
num_radix_blocks: input.d_blocks.0.lwe_ciphertext_count.0 as u32 / num_radix_ciphertexts,
|
||||
max_num_radix_blocks: input.d_blocks.0.lwe_ciphertext_count.0 as u32
|
||||
/ num_radix_ciphertexts,
|
||||
lwe_dimension: input.d_blocks.0.lwe_dimension.0 as u32,
|
||||
num_radix_ciphertexts,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -200,6 +219,7 @@ fn prepare_cuda_radix_ffi_from_slice<T: UnsignedInteger>(
|
||||
num_radix_blocks,
|
||||
max_num_radix_blocks: num_radix_blocks,
|
||||
lwe_dimension,
|
||||
num_radix_ciphertexts: 1u32,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -217,6 +237,7 @@ fn prepare_cuda_radix_ffi_from_slice_mut<T: UnsignedInteger>(
|
||||
num_radix_blocks,
|
||||
max_num_radix_blocks: num_radix_blocks,
|
||||
lwe_dimension,
|
||||
num_radix_ciphertexts: 1u32,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7604,3 +7625,143 @@ pub unsafe fn expand_async<T: UnsignedInteger, B: Numeric>(
|
||||
);
|
||||
cleanup_expand_without_verification_64(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr));
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
/// # Safety
|
||||
///
|
||||
/// This operation modifies raw GPU pointers on the GPU
|
||||
pub unsafe fn unchecked_bitop_vec_radix_kb_assign<T: UnsignedInteger, B: Numeric>(
|
||||
streams: &CudaStreams,
|
||||
radix_lwe_left: &mut CudaRadixCiphertext,
|
||||
radix_lwe_right: &CudaRadixCiphertext,
|
||||
bootstrapping_key: &CudaVec<B>,
|
||||
keyswitch_key: &CudaVec<T>,
|
||||
message_modulus: MessageModulus,
|
||||
carry_modulus: CarryModulus,
|
||||
glwe_dimension: GlweDimension,
|
||||
polynomial_size: PolynomialSize,
|
||||
big_lwe_dimension: LweDimension,
|
||||
small_lwe_dimension: LweDimension,
|
||||
ks_level: DecompositionLevelCount,
|
||||
ks_base_log: DecompositionBaseLog,
|
||||
pbs_level: DecompositionLevelCount,
|
||||
pbs_base_log: DecompositionBaseLog,
|
||||
op: BitOpType,
|
||||
num_blocks: u32,
|
||||
num_radix_ciphertexts: u32,
|
||||
pbs_type: PBSType,
|
||||
grouping_factor: LweBskGroupingFactor,
|
||||
ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
|
||||
) {
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
radix_lwe_left.d_blocks.0.d_vec.gpu_index(0),
|
||||
"GPU error: first stream is on GPU {}, first lhs pointer is on GPU {}",
|
||||
streams.gpu_indexes[0].get(),
|
||||
radix_lwe_left.d_blocks.0.d_vec.gpu_index(0).get(),
|
||||
);
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
radix_lwe_right.d_blocks.0.d_vec.gpu_index(0),
|
||||
"GPU error: first stream is on GPU {}, first rhs pointer is on GPU {}",
|
||||
streams.gpu_indexes[0].get(),
|
||||
radix_lwe_right.d_blocks.0.d_vec.gpu_index(0).get(),
|
||||
);
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
bootstrapping_key.gpu_index(0),
|
||||
"GPU error: first stream is on GPU {}, first bsk pointer is on GPU {}",
|
||||
streams.gpu_indexes[0].get(),
|
||||
bootstrapping_key.gpu_index(0).get(),
|
||||
);
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
keyswitch_key.gpu_index(0),
|
||||
"GPU error: first stream is on GPU {}, first ksk pointer is on GPU {}",
|
||||
streams.gpu_indexes[0].get(),
|
||||
keyswitch_key.gpu_index(0).get(),
|
||||
);
|
||||
let ct_modulus = radix_lwe_left
|
||||
.d_blocks
|
||||
.ciphertext_modulus()
|
||||
.raw_modulus_float();
|
||||
let (noise_reduction_type, ms_noise_reduction_key_ffi) =
|
||||
resolve_ms_noise_reduction_config(ms_noise_reduction_configuration, ct_modulus);
|
||||
|
||||
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
|
||||
let mut radix_lwe_left_degrees = radix_lwe_left
|
||||
.info
|
||||
.blocks
|
||||
.iter()
|
||||
.map(|b| b.degree.0)
|
||||
.collect();
|
||||
let mut radix_lwe_left_noise_levels = radix_lwe_left
|
||||
.info
|
||||
.blocks
|
||||
.iter()
|
||||
.map(|b| b.noise_level.0)
|
||||
.collect();
|
||||
let mut cuda_ffi_radix_lwe_left = prepare_cuda_radix_vec_ffi(
|
||||
radix_lwe_left,
|
||||
&mut radix_lwe_left_degrees,
|
||||
&mut radix_lwe_left_noise_levels,
|
||||
num_radix_ciphertexts,
|
||||
);
|
||||
// Here even though the input is not modified, data is passed as mutable.
|
||||
// This avoids having to create two structs for the CudaRadixCiphertext pointers,
|
||||
// one const and the other mutable.
|
||||
// Having two structs on the Cuda side complicates things as we need to be sure we pass the
|
||||
// Const structure as input instead of the mutable structure, which leads to complicated
|
||||
// data manipulation on the C++ side to change mutability of data.
|
||||
let mut radix_lwe_right_degrees = radix_lwe_right
|
||||
.info
|
||||
.blocks
|
||||
.iter()
|
||||
.map(|b| b.degree.0)
|
||||
.collect();
|
||||
let mut radix_lwe_right_noise_levels = radix_lwe_right
|
||||
.info
|
||||
.blocks
|
||||
.iter()
|
||||
.map(|b| b.noise_level.0)
|
||||
.collect();
|
||||
let cuda_ffi_radix_lwe_right = prepare_cuda_radix_vec_ffi(
|
||||
radix_lwe_right,
|
||||
&mut radix_lwe_right_degrees,
|
||||
&mut radix_lwe_right_noise_levels,
|
||||
num_radix_ciphertexts,
|
||||
);
|
||||
scratch_cuda_integer_radix_bitop_kb_64(
|
||||
streams.ffi(),
|
||||
std::ptr::addr_of_mut!(mem_ptr),
|
||||
glwe_dimension.0 as u32,
|
||||
polynomial_size.0 as u32,
|
||||
big_lwe_dimension.0 as u32,
|
||||
small_lwe_dimension.0 as u32,
|
||||
ks_level.0 as u32,
|
||||
ks_base_log.0 as u32,
|
||||
pbs_level.0 as u32,
|
||||
pbs_base_log.0 as u32,
|
||||
grouping_factor.0 as u32,
|
||||
num_blocks * num_radix_ciphertexts,
|
||||
message_modulus.0 as u32,
|
||||
carry_modulus.0 as u32,
|
||||
pbs_type as u32,
|
||||
op as u32,
|
||||
true,
|
||||
noise_reduction_type as u32,
|
||||
);
|
||||
cuda_bitop_integer_radix_ciphertext_kb_64(
|
||||
streams.ffi(),
|
||||
&raw mut cuda_ffi_radix_lwe_left,
|
||||
&raw const cuda_ffi_radix_lwe_left,
|
||||
&raw const cuda_ffi_radix_lwe_right,
|
||||
mem_ptr,
|
||||
bootstrapping_key.ptr.as_ptr(),
|
||||
keyswitch_key.ptr.as_ptr(),
|
||||
&raw const ms_noise_reduction_key_ffi,
|
||||
);
|
||||
cleanup_cuda_integer_bitop(streams.ffi(), std::ptr::addr_of_mut!(mem_ptr));
|
||||
update_noise_degree(radix_lwe_left, &cuda_ffi_radix_lwe_left);
|
||||
streams.synchronize();
|
||||
}
|
||||
|
||||
@@ -10,7 +10,8 @@ use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
|
||||
use crate::integer::gpu::server_key::CudaBootstrappingKey;
|
||||
use crate::integer::gpu::{
|
||||
get_bitop_integer_radix_kb_size_on_gpu, get_full_propagate_assign_size_on_gpu,
|
||||
unchecked_bitop_integer_radix_kb_assign_async, BitOpType, CudaServerKey, PBSType,
|
||||
unchecked_bitop_integer_radix_kb_assign_async, unchecked_bitop_vec_radix_kb_assign, BitOpType,
|
||||
CudaServerKey, PBSType,
|
||||
};
|
||||
|
||||
impl CudaServerKey {
|
||||
@@ -977,4 +978,142 @@ impl CudaServerKey {
|
||||
let bitnot_mem = (lwe_ciphertext_count.0 * size_of::<u64>()) as u64;
|
||||
full_prop_mem.max(bitnot_mem)
|
||||
}
|
||||
|
||||
pub fn unchecked_bitop_vec_assign<T: CudaIntegerRadixCiphertext>(
|
||||
&self,
|
||||
ct_left: &mut T,
|
||||
ct_right: &T,
|
||||
op: BitOpType,
|
||||
num_radix_ciphertexts: u32,
|
||||
streams: &CudaStreams,
|
||||
) {
|
||||
assert_eq!(
|
||||
ct_left.as_ref().d_blocks.lwe_dimension(),
|
||||
ct_right.as_ref().d_blocks.lwe_dimension()
|
||||
);
|
||||
assert_eq!(
|
||||
ct_left.as_ref().d_blocks.lwe_ciphertext_count(),
|
||||
ct_right.as_ref().d_blocks.lwe_ciphertext_count()
|
||||
);
|
||||
|
||||
let num_blocks =
|
||||
ct_left.as_ref().d_blocks.lwe_ciphertext_count().0 as u32 / num_radix_ciphertexts;
|
||||
|
||||
unsafe {
|
||||
match &self.bootstrapping_key {
|
||||
CudaBootstrappingKey::Classic(d_bsk) => {
|
||||
unchecked_bitop_vec_radix_kb_assign(
|
||||
streams,
|
||||
ct_left.as_mut(),
|
||||
ct_right.as_ref(),
|
||||
&d_bsk.d_vec,
|
||||
&self.key_switching_key.d_vec,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_bsk.glwe_dimension,
|
||||
d_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_bsk.decomp_level_count,
|
||||
d_bsk.decomp_base_log,
|
||||
op,
|
||||
num_blocks,
|
||||
num_radix_ciphertexts,
|
||||
PBSType::Classical,
|
||||
LweBskGroupingFactor(0),
|
||||
d_bsk.ms_noise_reduction_configuration.as_ref(),
|
||||
);
|
||||
}
|
||||
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
|
||||
unchecked_bitop_vec_radix_kb_assign(
|
||||
streams,
|
||||
ct_left.as_mut(),
|
||||
ct_right.as_ref(),
|
||||
&d_multibit_bsk.d_vec,
|
||||
&self.key_switching_key.d_vec,
|
||||
self.message_modulus,
|
||||
self.carry_modulus,
|
||||
d_multibit_bsk.glwe_dimension,
|
||||
d_multibit_bsk.polynomial_size,
|
||||
self.key_switching_key
|
||||
.input_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key
|
||||
.output_key_lwe_size()
|
||||
.to_lwe_dimension(),
|
||||
self.key_switching_key.decomposition_level_count(),
|
||||
self.key_switching_key.decomposition_base_log(),
|
||||
d_multibit_bsk.decomp_level_count,
|
||||
d_multibit_bsk.decomp_base_log,
|
||||
op,
|
||||
num_blocks,
|
||||
num_radix_ciphertexts,
|
||||
PBSType::MultiBit,
|
||||
d_multibit_bsk.grouping_factor,
|
||||
None,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn unchecked_bitand_vec<T: CudaIntegerRadixCiphertext>(
|
||||
&self,
|
||||
ct_left: &T,
|
||||
ct_right: &T,
|
||||
num_radix_ciphertexts: u32,
|
||||
streams: &CudaStreams,
|
||||
) -> T {
|
||||
let mut result = unsafe { ct_left.duplicate_async(streams) };
|
||||
self.unchecked_bitop_vec_assign(
|
||||
&mut result,
|
||||
ct_right,
|
||||
BitOpType::And,
|
||||
num_radix_ciphertexts,
|
||||
streams,
|
||||
);
|
||||
result
|
||||
}
|
||||
pub fn bitand_vec<T: CudaIntegerRadixCiphertext>(
|
||||
&self,
|
||||
ct_left: &mut T,
|
||||
ct_right: &T,
|
||||
num_radix_ciphertexts: u32,
|
||||
streams: &CudaStreams,
|
||||
) -> Vec<T> {
|
||||
let mut tmp_rhs;
|
||||
|
||||
let (lhs, rhs) = unsafe {
|
||||
match (
|
||||
ct_left.block_carries_are_empty(),
|
||||
ct_right.block_carries_are_empty(),
|
||||
) {
|
||||
(true, true) => (ct_left, ct_right),
|
||||
(true, false) => {
|
||||
tmp_rhs = ct_right.duplicate_async(streams);
|
||||
self.full_propagate_assign_async(&mut tmp_rhs, streams);
|
||||
(ct_left, &tmp_rhs)
|
||||
}
|
||||
(false, true) => {
|
||||
self.full_propagate_assign_async(ct_left, streams);
|
||||
(ct_left, ct_right)
|
||||
}
|
||||
(false, false) => {
|
||||
tmp_rhs = ct_right.duplicate_async(streams);
|
||||
|
||||
self.full_propagate_assign_async(ct_left, streams);
|
||||
self.full_propagate_assign_async(&mut tmp_rhs, streams);
|
||||
(ct_left, &tmp_rhs)
|
||||
}
|
||||
}
|
||||
};
|
||||
let result = self.unchecked_bitand_vec(lhs, rhs, num_radix_ciphertexts, streams);
|
||||
result.to_integer_radix_ciphertext_vec(num_radix_ciphertexts, streams)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user