feat(gpu): implement fhe rand on gpu

This commit is contained in:
Guillermo Oyarzun
2025-01-13 16:26:52 +01:00
parent 7a8efb1934
commit a9e4724178
19 changed files with 1181 additions and 144 deletions

View File

@@ -5,45 +5,50 @@
extern "C" {
void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void const *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void const *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void const *lwe_array_in_1,
void const *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void const *lwe_array_in_1,
void const *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_negate_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
void cuda_negate_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in_1, void const *lwe_array_in_2,
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in_1, void const *lwe_array_in_2,
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *plaintext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *plaintext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *cleartext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *cleartext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count);
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, const uint64_t plaintext_in,
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count);
}
#endif // CUDA_LINALG_H_

View File

@@ -4,12 +4,11 @@
* Perform the addition of two u32 input LWE ciphertext vectors.
* See the equivalent operation on u64 ciphertexts for more details.
*/
void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void const *lwe_array_in_1,
void const *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
void cuda_add_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in_1, void const *lwe_array_in_2,
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count) {
host_addition<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
@@ -44,12 +43,11 @@ void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
* vectors are left unchanged. This function is a wrapper to a device function
* that performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void const *lwe_array_in_1,
void const *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
void cuda_add_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in_1, void const *lwe_array_in_2,
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count) {
host_addition<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
@@ -65,7 +63,8 @@ void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *plaintext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext<uint32_t>(
static_cast<cudaStream_t>(stream), gpu_index,
@@ -105,7 +104,8 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *plaintext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
@@ -114,3 +114,41 @@ void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
static_cast<const uint64_t *>(plaintext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the addition of a u64 input LWE ciphertext vector with a u64 input
* plaintext scalar.
* - `stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
* been allocated and initialized before calling this function. It has the same
* size as the output array.
* - `plaintext_in` is the plaintext used as input.
* - `input_lwe_dimension` is the number of mask elements in the input and
* output LWE ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
* input LWE ciphertext vector, as well as in the output.
*
* The same input plaintext is added to the body of the
* LWE ciphertexts in the LWE ciphertext vector. The result of the
* operation is stored in the output LWE ciphertext vector. The two input
* vectors are unchanged. This function is a wrapper to a device function that
* performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, const uint64_t plaintext_in,
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext_scalar<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_array_in), plaintext_in,
input_lwe_dimension, input_lwe_ciphertext_count);
}

View File

@@ -13,9 +13,10 @@
#include <stdio.h>
template <typename T>
__global__ void
plaintext_addition(T *output, T const *lwe_input, T const *plaintext_input,
uint32_t input_lwe_dimension, uint32_t num_entries) {
__global__ void plaintext_addition(T *output, T const *lwe_input,
T const *plaintext_input,
const uint32_t input_lwe_dimension,
const uint32_t num_entries) {
int tid = threadIdx.x;
int plaintext_index = blockIdx.x * blockDim.x + tid;
@@ -28,10 +29,26 @@ plaintext_addition(T *output, T const *lwe_input, T const *plaintext_input,
}
template <typename T>
__host__ void
host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
T const *lwe_input, T const *plaintext_input,
uint32_t lwe_dimension, uint32_t lwe_ciphertext_count) {
__global__ void plaintext_addition_scalar(T *output, T const *lwe_input,
const T plaintext_input,
const uint32_t input_lwe_dimension,
const uint32_t num_entries) {
int tid = threadIdx.x;
int lwe_index = blockIdx.x * blockDim.x + tid;
if (lwe_index < num_entries) {
int index = lwe_index * (input_lwe_dimension + 1) + input_lwe_dimension;
// Here we take advantage of the wrapping behaviour of uint
output[index] = lwe_input[index] + plaintext_input;
}
}
template <typename T>
__host__ void host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index,
T *output, T const *lwe_input,
T const *plaintext_input,
const uint32_t lwe_dimension,
const uint32_t lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
int num_blocks = 0, num_threads = 0;
@@ -48,6 +65,27 @@ host_addition_plaintext(cudaStream_t stream, uint32_t gpu_index, T *output,
check_cuda_error(cudaGetLastError());
}
template <typename T>
__host__ void host_addition_plaintext_scalar(
cudaStream_t stream, uint32_t gpu_index, T *output, T const *lwe_input,
const T plaintext_input, const uint32_t lwe_dimension,
const uint32_t lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = lwe_ciphertext_count;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
cuda_memcpy_async_gpu_to_gpu(
output, lwe_input, (lwe_dimension + 1) * lwe_ciphertext_count * sizeof(T),
stream, gpu_index);
plaintext_addition_scalar<T><<<grid, thds, 0, stream>>>(
output, lwe_input, plaintext_input, lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
template <typename T>
__global__ void addition(T *output, T const *input_1, T const *input_2,
uint32_t num_entries) {
@@ -64,8 +102,8 @@ __global__ void addition(T *output, T const *input_1, T const *input_2,
template <typename T>
__host__ void host_addition(cudaStream_t stream, uint32_t gpu_index, T *output,
T const *input_1, T const *input_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
// lwe_size includes the presence of the body

View File

@@ -7,7 +7,8 @@
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *cleartext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count) {
host_cleartext_vec_multiplication<uint32_t>(
static_cast<cudaStream_t>(stream), gpu_index,
@@ -47,7 +48,8 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, void const *cleartext_array_in,
uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) {
const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count) {
host_cleartext_vec_multiplication<uint64_t>(
static_cast<cudaStream_t>(stream), gpu_index,

View File

@@ -16,8 +16,8 @@
template <typename T>
__global__ void cleartext_vec_multiplication(T *output, T const *lwe_input,
T const *cleartext_input,
uint32_t input_lwe_dimension,
uint32_t num_entries) {
const uint32_t input_lwe_dimension,
const uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
@@ -31,8 +31,8 @@ __global__ void cleartext_vec_multiplication(T *output, T const *lwe_input,
template <typename T>
__host__ void host_cleartext_vec_multiplication(
cudaStream_t stream, uint32_t gpu_index, T *output, T const *lwe_input,
T const *cleartext_input, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
T const *cleartext_input, const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
// lwe_size includes the presence of the body

View File

@@ -4,11 +4,10 @@
* Perform the negation of a u32 input LWE ciphertext vector.
* See the equivalent operation on u64 ciphertexts for more details.
*/
void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void const *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
void cuda_negate_lwe_ciphertext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count) {
host_negation<uint32_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint32_t *>(lwe_array_out),
@@ -38,11 +37,10 @@ void cuda_negate_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
* LWE ciphertext vector is left unchanged. This function is a wrapper to a
* device function that performs the operation on the GPU.
*/
void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void const *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
void cuda_negate_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_array_in, const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count) {
host_negation<uint64_t>(static_cast<cudaStream_t>(stream), gpu_index,
static_cast<uint64_t *>(lwe_array_out),

View File

@@ -23,8 +23,8 @@ __global__ void negation(T *output, T const *input, uint32_t num_entries) {
template <typename T>
__host__ void host_negation(cudaStream_t stream, uint32_t gpu_index, T *output,
T const *input, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
T const *input, const uint32_t input_lwe_dimension,
const uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
// lwe_size includes the presence of the body

View File

@@ -1345,6 +1345,17 @@ extern "C" {
input_lwe_ciphertext_count: u32,
);
}
extern "C" {
pub fn cuda_add_lwe_ciphertext_vector_plaintext_64(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_array_in: *const ffi::c_void,
plaintext_in: u64,
input_lwe_dimension: u32,
input_lwe_ciphertext_count: u32,
);
}
extern "C" {
pub fn cuda_fourier_polynomial_mul(
stream: *mut ffi::c_void,

View File

@@ -1307,11 +1307,12 @@ define_server_key_bench_default_fn!(
#[cfg(feature = "gpu")]
mod cuda {
use super::*;
use criterion::criterion_group;
use criterion::{black_box, criterion_group};
use tfhe::core_crypto::gpu::CudaStreams;
use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
use tfhe::integer::gpu::server_key::CudaServerKey;
use tfhe_csprng::seeders::Seed;
fn bench_cuda_server_key_unary_function_clean_inputs<F>(
c: &mut Criterion,
@@ -1731,6 +1732,84 @@ mod cuda {
bench_group.finish()
}
pub fn cuda_unsigned_oprf(c: &mut Criterion) {
let bench_name = "integer::cuda::unsigned_oprf";
let mut bench_group = c.benchmark_group(bench_name);
bench_group
.sample_size(15)
.measurement_time(std::time::Duration::from_secs(30));
let streams = CudaStreams::new_multi_gpu();
for (param, num_block, bit_size) in ParamsAndNumBlocksIter::default() {
let param_name = param.name();
let bench_id;
match BENCH_TYPE.get().unwrap() {
BenchmarkType::Latency => {
bench_id = format!("{bench_name}::{param_name}::{bit_size}_bits");
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) =
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &streams);
b.iter(|| {
_ = black_box(
gpu_sks
.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
&streams,
),
);
})
});
}
BenchmarkType::Throughput => {
bench_id = format!("{bench_name}::throughput::{param_name}::{bit_size}_bits");
let elements = throughput_num_threads(num_block);
bench_group.throughput(Throughput::Elements(elements));
bench_group.bench_function(&bench_id, |b| {
let (cks, _cpu_sks) =
KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let gpu_sks = CudaServerKey::new(&cks, &streams);
b.iter(|| {
(0..elements).into_par_iter().for_each(|i| {
let selected_gpu =
streams.gpu_indexes[i as usize % streams.gpu_indexes.len()];
let stream = CudaStreams::new_single_gpu(selected_gpu);
gpu_sks
.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
Seed(0),
bit_size as u64,
num_block as u64,
&stream,
);
})
})
});
}
}
write_to_json::<u64, _>(
&bench_id,
param,
param.name(),
"oprf",
&OperatorType::Atomic,
bit_size as u32,
vec![param.message_modulus().0.ilog2(); num_block],
);
}
bench_group.finish()
}
macro_rules! define_cuda_server_key_bench_clean_input_unary_fn (
(method_name: $server_key_method:ident, display_name:$name:ident) => {
::paste::paste!{
@@ -2376,6 +2455,7 @@ mod cuda {
cuda_trailing_zeros,
cuda_trailing_ones,
cuda_ilog2,
cuda_unsigned_oprf,
);
criterion_group!(
@@ -2395,6 +2475,7 @@ mod cuda {
cuda_scalar_mul,
cuda_scalar_div,
cuda_scalar_rem,
cuda_unsigned_oprf,
);
criterion_group!(

View File

@@ -481,6 +481,31 @@ pub unsafe fn add_lwe_ciphertext_vector_plaintext_vector_async<T: UnsignedIntege
);
}
/// Addition of a vector of LWE ciphertexts with a plaintext scalar
///
/// # Safety
///
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
/// required
pub unsafe fn add_lwe_ciphertext_vector_plaintext_scalar_async<T: UnsignedInteger>(
streams: &CudaStreams,
lwe_array_out: &mut CudaVec<T>,
lwe_array_in: &CudaVec<T>,
plaintext_in: u64,
lwe_dimension: LweDimension,
num_samples: u32,
) {
cuda_add_lwe_ciphertext_vector_plaintext_64(
streams.ptr[0],
streams.gpu_indexes[0].0,
lwe_array_out.as_mut_c_ptr(0),
lwe_array_in.as_c_ptr(0),
plaintext_in,
lwe_dimension.0 as u32,
num_samples,
);
}
/// Assigned addition of a vector of LWE ciphertexts with a vector of plaintexts
///
/// # Safety

View File

@@ -1,6 +1,12 @@
use super::FheBool;
use super::{FheBool, InnerBoolean};
use crate::high_level_api::global_state;
#[cfg(feature = "gpu")]
use crate::high_level_api::global_state::with_thread_local_cuda_streams;
use crate::high_level_api::keys::InternalServerKey;
#[cfg(feature = "gpu")]
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
#[cfg(feature = "gpu")]
use crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
use crate::integer::BooleanBlock;
use tfhe_csprng::seeders::Seed;
@@ -24,16 +30,28 @@ impl FheBool {
/// let dec_result: bool = ct_res.decrypt(&client_key);
/// ```
pub fn generate_oblivious_pseudo_random(seed: Seed) -> Self {
global_state::with_internal_keys(|key| match key {
let (ciphertext, tag) = global_state::with_internal_keys(|key| match key {
InternalServerKey::Cpu(key) => {
let ct = key.pbs_key().key.generate_oblivious_pseudo_random(seed, 1);
Self::new(BooleanBlock(ct), key.tag.clone())
(
InnerBoolean::Cpu(BooleanBlock::new_unchecked(ct)),
key.tag.clone(),
)
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => {
todo!("Cuda devices do not yet support oblivious pseudo random generation")
}
})
InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| {
let d_ct: CudaUnsignedRadixCiphertext = cuda_key
.key
.key
.generate_oblivious_pseudo_random(seed, 1, streams);
(
InnerBoolean::Cuda(CudaBooleanBlock::from_cuda_radix_ciphertext(
d_ct.ciphertext,
)),
cuda_key.tag.clone(),
)
}),
});
Self::new(ciphertext, tag)
}
}

View File

@@ -1,8 +1,11 @@
use super::{FheIntId, FheUintId};
use super::{FheIntId, FheUint, FheUintId};
use crate::high_level_api::global_state;
#[cfg(feature = "gpu")]
use crate::high_level_api::global_state::with_thread_local_cuda_streams;
use crate::high_level_api::keys::InternalServerKey;
use crate::{FheInt, FheUint, Seed};
#[cfg(feature = "gpu")]
use crate::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext};
use crate::{FheInt, Seed};
impl<Id: FheUintId> FheUint<Id> {
/// Generates an encrypted unsigned integer
/// taken uniformly in its full range using the given seed.
@@ -35,9 +38,18 @@ impl<Id: FheUintId> FheUint<Id> {
Self::new(ct, key.tag.clone())
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => {
todo!("Cuda devices do not yet support oblivious pseudo random generation")
}
InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| {
let d_ct: CudaUnsignedRadixCiphertext = cuda_key
.key
.key
.par_generate_oblivious_pseudo_random_unsigned_integer(
seed,
Id::num_blocks(cuda_key.message_modulus()) as u64,
streams,
);
Self::new(d_ct, cuda_key.tag.clone())
}),
})
}
/// Generates an encrypted `num_block` blocks unsigned integer
@@ -75,9 +87,18 @@ impl<Id: FheUintId> FheUint<Id> {
Self::new(ct, key.tag.clone())
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => {
todo!("Cuda devices do not yet support oblivious pseudo random generation")
}
InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| {
let d_ct: CudaUnsignedRadixCiphertext = cuda_key
.key
.key
.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
seed,
random_bits_count,
Id::num_blocks(cuda_key.message_modulus()) as u64,
streams,
);
Self::new(d_ct, cuda_key.tag.clone())
}),
})
}
}
@@ -115,9 +136,18 @@ impl<Id: FheIntId> FheInt<Id> {
Self::new(ct, key.tag.clone())
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => {
todo!("Cuda devices do not yet support oblivious pseudo random generation")
}
InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| {
let d_ct: CudaSignedRadixCiphertext = cuda_key
.key
.key
.par_generate_oblivious_pseudo_random_signed_integer(
seed,
Id::num_blocks(cuda_key.message_modulus()) as u64,
streams,
);
Self::new(d_ct, cuda_key.tag.clone())
}),
})
}
@@ -157,9 +187,18 @@ impl<Id: FheIntId> FheInt<Id> {
Self::new(ct, key.tag.clone())
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => {
todo!("Cuda devices do not yet support oblivious pseudo random generation")
}
InternalServerKey::Cuda(cuda_key) => with_thread_local_cuda_streams(|streams| {
let d_ct: CudaSignedRadixCiphertext = cuda_key
.key
.key
.par_generate_oblivious_pseudo_random_signed_integer_bounded(
seed,
random_bits_count,
Id::num_blocks(cuda_key.message_modulus()) as u64,
streams,
);
Self::new(d_ct, cuda_key.tag.clone())
}),
})
}
}

View File

@@ -20,7 +20,9 @@ use crate::integer::gpu::{
};
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::shortint::ciphertext::{Degree, NoiseLevel};
use crate::shortint::engine::{fill_accumulator, fill_many_lut_accumulator};
use crate::shortint::engine::{
fill_accumulator, fill_accumulator_no_encoding, fill_many_lut_accumulator,
};
use crate::shortint::server_key::{
BivariateLookupTableOwned, LookupTableOwned, ManyLookupTableOwned,
};
@@ -36,6 +38,7 @@ mod even_odd;
mod ilog2;
mod mul;
mod neg;
mod oprf;
mod rotate;
mod scalar_add;
mod scalar_bitwise_op;
@@ -805,6 +808,29 @@ impl CudaServerKey {
degree: Degree::new(max_value),
}
}
pub(crate) fn generate_lookup_table_no_encode<F>(&self, f: F) -> LookupTableOwned
where
F: Fn(u64) -> u64,
{
let (glwe_size, polynomial_size) = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
(d_bsk.glwe_dimension.to_glwe_size(), d_bsk.polynomial_size)
}
CudaBootstrappingKey::MultiBit(d_bsk) => {
(d_bsk.glwe_dimension.to_glwe_size(), d_bsk.polynomial_size)
}
};
let mut acc = GlweCiphertext::new(0, glwe_size, polynomial_size, self.ciphertext_modulus);
fill_accumulator_no_encoding(&mut acc, polynomial_size, glwe_size, f);
LookupTableOwned {
acc,
// We should not rely on the degree in this case
// The degree should be set manually on the outputs of PBS by this LUT
degree: Degree::new(self.message_modulus.0 * self.carry_modulus.0 * 2),
}
}
pub fn generate_many_lookup_table(
&self,

View File

@@ -0,0 +1,736 @@
use crate::core_crypto::gpu::CudaStreams;
use crate::integer::gpu::ciphertext::{
CudaIntegerRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext,
};
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::core_crypto::commons::generators::DeterministicSeeder;
use crate::core_crypto::prelude::DefaultRandomGenerator;
use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
use crate::shortint::oprf::create_random_from_seed_modulus_switched;
use crate::shortint::server_key::LookupTableOwned;
pub use tfhe_csprng::seeders::{Seed, Seeder};
use crate::core_crypto::gpu::{
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext,
cuda_programmable_bootstrap_lwe_ciphertext,
};
use crate::core_crypto::commons::numeric::Numeric;
use crate::core_crypto::gpu::add_lwe_ciphertext_vector_plaintext_scalar_async;
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::prelude::CastInto;
use crate::integer::gpu::server_key::radix::{CudaLweCiphertextList, LweCiphertextCount};
use crate::integer::gpu::CudaVec;
use itertools::Itertools;
impl CudaServerKey {
/// Generates an encrypted `num_block` blocks unsigned integer
/// taken uniformly in its full range using the given seed.
/// The encryted value is oblivious to the server.
/// It can be useful to make server random generation deterministic.
///
/// ```rust
/// use tfhe::core_crypto::gpu::CudaStreams;
/// use tfhe::core_crypto::gpu::vec::GpuIndex;
/// use tfhe::integer::gpu::gen_keys_gpu;
/// use tfhe::shortint::parameters::PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
/// use tfhe::Seed;
///
/// let size = 4;
/// let gpu_index = 0;
/// let streams = CudaStreams::new_single_gpu(GpuIndex(gpu_index));
///
/// // Generate the client key and the server key:
/// let (cks, sks) = gen_keys_gpu(PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, &streams);
///
/// let d_ct_res = sks.par_generate_oblivious_pseudo_random_unsigned_integer(Seed(0), size as u64, &streams);
/// let ct_res = d_ct_res.to_radix_ciphertext(&streams);
/// // Decrypt:
/// let dec_result: u64 = cks.decrypt_radix(&ct_res);
///
/// assert!(dec_result < 1 << (2 * size));
/// ```
pub fn par_generate_oblivious_pseudo_random_unsigned_integer(
&self,
seed: Seed,
num_blocks: u64,
streams: &CudaStreams,
) -> CudaUnsignedRadixCiphertext {
assert!(self.message_modulus.0.is_power_of_two());
let range_log_size = self.message_modulus.0.ilog2() as u64 * num_blocks;
let random_bits_count = range_log_size;
assert!(self.message_modulus.0.is_power_of_two());
let mut streams_vector = Vec::<CudaStreams>::with_capacity(num_blocks as usize);
for _ in 0..num_blocks {
streams_vector.push(CudaStreams::new_single_gpu(streams.gpu_indexes[0]));
}
let message_bits_count = self.message_modulus.0.ilog2() as u64;
let mut deterministic_seeder = DeterministicSeeder::<DefaultRandomGenerator>::new(seed);
let seeds: Vec<Seed> = (0..num_blocks)
.map(|_| deterministic_seeder.seed())
.collect();
let blocks = seeds
.into_par_iter()
.enumerate()
.map(|(i, seed)| {
let stream_index = i;
let i = i as u64;
if i * message_bits_count < random_bits_count {
// if we generate 5 bits of noise in n blocks of 2 bits, the third (i=2) block
// must have only one bit of random
if random_bits_count < (i + 1) * message_bits_count {
let top_message_bits_count = random_bits_count - i * message_bits_count;
assert!(top_message_bits_count <= message_bits_count);
let ct: CudaUnsignedRadixCiphertext = self
.generate_oblivious_pseudo_random(
seed,
top_message_bits_count,
&streams_vector[stream_index],
);
ct.ciphertext
} else {
let ct: CudaUnsignedRadixCiphertext = self
.generate_oblivious_pseudo_random(
seed,
message_bits_count,
&streams_vector[stream_index],
);
ct.ciphertext
}
} else {
let ct: CudaUnsignedRadixCiphertext =
self.create_trivial_zero_radix(1, &streams_vector[stream_index]);
ct.ciphertext
}
})
.collect::<Vec<_>>();
self.convert_radixes_vec_to_single_radix_ciphertext(&blocks, streams)
}
/// Generates an encrypted `num_block` blocks unsigned integer
/// taken uniformly in `[0, 2^random_bits_count[` using the given seed.
/// The encryted value is oblivious to the server.
/// It can be useful to make server random generation deterministic.
///
/// ```rust
/// use tfhe::core_crypto::gpu::CudaStreams;
/// use tfhe::core_crypto::gpu::vec::GpuIndex;
/// use tfhe::integer::gpu::gen_keys_gpu;
/// use tfhe::shortint::parameters::PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
/// use tfhe::Seed;
///
/// let gpu_index = 0;
/// let streams = CudaStreams::new_single_gpu(GpuIndex(gpu_index));
/// let size = 4;
///
/// // Generate the client key and the server key:
/// let (cks, sks) = gen_keys_gpu(PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, &streams);
///
/// let random_bits_count = 3;
///
/// let d_ct_res = sks.par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
/// Seed(0),
/// random_bits_count,
/// size as u64,
/// &streams,
/// );
/// let ct_res = d_ct_res.to_radix_ciphertext(&streams);
/// // Decrypt:
/// let dec_result: u64 = cks.decrypt_radix(&ct_res);
/// assert!(dec_result < (1 << random_bits_count));
/// ```
pub fn par_generate_oblivious_pseudo_random_unsigned_integer_bounded(
&self,
seed: Seed,
random_bits_count: u64,
num_blocks: u64,
streams: &CudaStreams,
) -> CudaUnsignedRadixCiphertext {
assert!(self.message_modulus.0.is_power_of_two());
let range_log_size = self.message_modulus.0.ilog2() as u64 * num_blocks;
assert!(
random_bits_count <= range_log_size,
"The range asked for a random value (=[0, 2^{random_bits_count}[) does not fit in the available range [0, 2^{range_log_size}[",
);
assert!(self.message_modulus.0.is_power_of_two());
let mut streams_vector = Vec::<CudaStreams>::with_capacity(num_blocks as usize);
for _ in 0..num_blocks {
streams_vector.push(CudaStreams::new_single_gpu(streams.gpu_indexes[0]));
}
let message_bits_count = self.message_modulus.0.ilog2() as u64;
let mut deterministic_seeder = DeterministicSeeder::<DefaultRandomGenerator>::new(seed);
let seeds: Vec<Seed> = (0..num_blocks)
.map(|_| deterministic_seeder.seed())
.collect();
let blocks = seeds
.into_par_iter()
.enumerate()
.map(|(i, seed)| {
let stream_index = i;
let i = i as u64;
if i * message_bits_count < random_bits_count {
// if we generate 5 bits of noise in n blocks of 2 bits, the third (i=2) block
// must have only one bit of random
if random_bits_count < (i + 1) * message_bits_count {
let top_message_bits_count = random_bits_count - i * message_bits_count;
assert!(top_message_bits_count <= message_bits_count);
let ct: CudaUnsignedRadixCiphertext = self
.generate_oblivious_pseudo_random(
seed,
top_message_bits_count,
&streams_vector[stream_index],
);
ct.ciphertext
} else {
let ct: CudaUnsignedRadixCiphertext = self
.generate_oblivious_pseudo_random(
seed,
message_bits_count,
&streams_vector[stream_index],
);
ct.ciphertext
}
} else {
let ct: CudaUnsignedRadixCiphertext =
self.create_trivial_zero_radix(1, &streams_vector[stream_index]);
ct.ciphertext
}
})
.collect::<Vec<_>>();
self.convert_radixes_vec_to_single_radix_ciphertext(&blocks, streams)
}
/// Generates an encrypted `num_block` blocks signed integer
/// taken uniformly in its full range using the given seed.
/// The encryted value is oblivious to the server.
/// It can be useful to make server random generation deterministic.
///
/// ```rust
/// use tfhe::core_crypto::gpu::CudaStreams;
/// use tfhe::core_crypto::gpu::vec::GpuIndex;
/// use tfhe::integer::gpu::gen_keys_gpu;
/// use tfhe::shortint::parameters::PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
/// use tfhe::Seed;
///
/// let gpu_index = 0;
/// let streams = CudaStreams::new_single_gpu(GpuIndex(gpu_index));
/// let size = 4;
///
/// // Generate the client key and the server key:
/// let (cks, sks) = gen_keys_gpu(PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, &streams);
///
/// let d_ct_res = sks.par_generate_oblivious_pseudo_random_signed_integer(Seed(0), size as u64, &streams);
/// let ct_res = d_ct_res.to_signed_radix_ciphertext(&streams);
///
/// // Decrypt:
/// let dec_result: i64 = cks.decrypt_signed_radix(&ct_res);
/// assert!(dec_result < 1 << (2 * size - 1));
/// assert!(dec_result >= -(1 << (2 * size - 1)));
/// ```
pub fn par_generate_oblivious_pseudo_random_signed_integer(
&self,
seed: Seed,
num_blocks: u64,
streams: &CudaStreams,
) -> CudaSignedRadixCiphertext {
assert!(self.message_modulus.0.is_power_of_two());
let message_bits_count = self.message_modulus.0.ilog2() as u64;
let mut streams_vector = Vec::<CudaStreams>::with_capacity(num_blocks as usize);
for _ in 0..num_blocks {
streams_vector.push(CudaStreams::new_single_gpu(streams.gpu_indexes[0]));
}
let mut deterministic_seeder = DeterministicSeeder::<DefaultRandomGenerator>::new(seed);
let seeds: Vec<Seed> = (0..num_blocks)
.map(|_| deterministic_seeder.seed())
.collect();
let blocks = seeds
.into_par_iter()
.enumerate()
.map(|(i, seed)| {
let stream_index = i;
let ct: CudaSignedRadixCiphertext = self.generate_oblivious_pseudo_random(
seed,
message_bits_count,
&streams_vector[stream_index],
);
ct.ciphertext
})
.collect::<Vec<_>>();
self.convert_radixes_vec_to_single_radix_ciphertext(&blocks, streams)
}
/// Generates an encrypted `num_block` blocks signed integer
/// taken uniformly in `[0, 2^random_bits_count[` using the given seed.
/// The encryted value is oblivious to the server.
/// It can be useful to make server random generation deterministic.
///
/// ```rust
/// use tfhe::core_crypto::gpu::CudaStreams;
/// use tfhe::core_crypto::gpu::vec::GpuIndex;
/// use tfhe::integer::gpu::gen_keys_gpu;
/// use tfhe::shortint::parameters::PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
/// use tfhe::Seed;
///
/// let gpu_index = 0;
/// let streams = CudaStreams::new_single_gpu(GpuIndex(gpu_index));
/// let size = 4;
///
/// // Generate the client key and the server key:
/// let (cks, sks) = gen_keys_gpu(PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, &streams);
///
/// let random_bits_count = 3;
///
/// let d_ct_res = sks.par_generate_oblivious_pseudo_random_signed_integer_bounded(
/// Seed(0),
/// random_bits_count,
/// size as u64,
/// &streams,
/// );
/// let ct_res = d_ct_res.to_signed_radix_ciphertext(&streams);
///
/// // Decrypt:
/// let dec_result: i64 = cks.decrypt_signed_radix(&ct_res);
/// assert!(dec_result >= 0);
/// assert!(dec_result < (1 << random_bits_count));
/// ```
pub fn par_generate_oblivious_pseudo_random_signed_integer_bounded(
&self,
seed: Seed,
random_bits_count: u64,
num_blocks: u64,
streams: &CudaStreams,
) -> CudaSignedRadixCiphertext {
assert!(self.message_modulus.0.is_power_of_two());
let range_log_size = self.message_modulus.0.ilog2() as u64 * num_blocks;
#[allow(clippy::int_plus_one)]
{
assert!(
random_bits_count + 1 <= range_log_size,
"The range asked for a random value (=[0, 2^{}[) does not fit in the available range [-2^{}, 2^{}[",
random_bits_count, range_log_size-1, range_log_size-1,
);
}
assert!(self.message_modulus.0.is_power_of_two());
let mut streams_vector = Vec::<CudaStreams>::with_capacity(num_blocks as usize);
for _ in 0..num_blocks {
streams_vector.push(CudaStreams::new_single_gpu(streams.gpu_indexes[0]));
}
let message_bits_count = self.message_modulus.0.ilog2() as u64;
let mut deterministic_seeder = DeterministicSeeder::<DefaultRandomGenerator>::new(seed);
let seeds = (0..num_blocks).map(|_| deterministic_seeder.seed());
let blocks = seeds
.into_iter()
.enumerate()
.map(|(i, seed)| {
let stream_index = i;
let i = i as u64;
if i * message_bits_count < random_bits_count {
// if we generate 5 bits of noise in n blocks of 2 bits, the third (i=2)
// block must have only one bit of random
if random_bits_count < (i + 1) * message_bits_count {
let top_message_bits_count = random_bits_count - i * message_bits_count;
assert!(top_message_bits_count <= message_bits_count);
let ct: CudaUnsignedRadixCiphertext = self
.generate_oblivious_pseudo_random(
seed,
top_message_bits_count,
&streams_vector[stream_index],
);
ct.ciphertext
} else {
let ct: CudaUnsignedRadixCiphertext = self
.generate_oblivious_pseudo_random(
seed,
message_bits_count,
&streams_vector[stream_index],
);
ct.ciphertext
}
} else {
let ct: CudaUnsignedRadixCiphertext =
self.create_trivial_zero_radix(1, &streams_vector[stream_index]);
ct.ciphertext
}
})
.collect::<Vec<_>>();
self.convert_radixes_vec_to_single_radix_ciphertext(&blocks, streams)
}
/// Uniformly generates a random encrypted value in `[0, 2^random_bits_count[`
/// `2^random_bits_count` must be smaller than the message modulus
/// The encryted value is oblivious to the server
pub fn generate_oblivious_pseudo_random<T>(
&self,
seed: Seed,
random_bits_count: u64,
streams: &CudaStreams,
) -> T
where
T: CudaIntegerRadixCiphertext,
{
assert!(
1 << random_bits_count <= self.message_modulus.0,
"The range asked for a random value (=[0, 2^{}[) does not fit in the available range [0, {}[",
random_bits_count, self.message_modulus.0
);
self.generate_oblivious_pseudo_random_message_and_carry(seed, random_bits_count, streams)
}
/// Uniformly generates a random value in `[0, 2^random_bits_count[`
/// The encryted value is oblivious to the server
pub(crate) fn generate_oblivious_pseudo_random_message_and_carry<T>(
&self,
seed: Seed,
random_bits_count: u64,
streams: &CudaStreams,
) -> T
where
T: CudaIntegerRadixCiphertext,
{
assert!(
self.message_modulus.0.is_power_of_two(),
"The message modulus(={}), must be a power of 2 to use the OPRF",
self.message_modulus.0
);
let message_bits_count = self.message_modulus.0.ilog2() as u64;
assert!(
self.carry_modulus.0.is_power_of_two(),
"The carry modulus(={}), must be a power of 2 to use the OPRF",
self.carry_modulus.0
);
let carry_bits_count = self.carry_modulus.0.ilog2() as u64;
assert!(
random_bits_count <= carry_bits_count + message_bits_count,
"The number of random bits asked for (={random_bits_count}) is bigger than carry_bits_count (={carry_bits_count}) + message_bits_count(={message_bits_count})",
);
self.generate_oblivious_pseudo_random_custom_encoding(
seed,
random_bits_count,
1 + carry_bits_count + message_bits_count,
streams,
)
}
/// Uniformly generates a random encrypted value in `[0, 2^random_bits_count[`
/// The output in in the form 0000rrr000noise (rbc=3, fbc=7)
/// The encryted value is oblivious to the server
pub(crate) fn generate_oblivious_pseudo_random_custom_encoding<T>(
&self,
seed: Seed,
random_bits_count: u64,
full_bits_count: u64,
streams: &CudaStreams,
) -> T
where
T: CudaIntegerRadixCiphertext,
{
assert!(
random_bits_count <= full_bits_count,
"The number of random bits asked for (={random_bits_count}) is bigger than full_bits_count (={full_bits_count})"
);
let (in_lwe_size, out_lwe_dimension, polynomial_size) = match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => (
d_bsk.input_lwe_dimension().to_lwe_size(),
d_bsk.output_lwe_dimension(),
d_bsk.polynomial_size(),
),
CudaBootstrappingKey::MultiBit(d_bsk) => (
d_bsk.input_lwe_dimension().to_lwe_size(),
d_bsk.output_lwe_dimension(),
d_bsk.polynomial_size(),
),
};
let seeded = create_random_from_seed_modulus_switched(
seed,
in_lwe_size,
polynomial_size.to_blind_rotation_input_modulus_log(),
self.ciphertext_modulus,
);
let p = 1 << random_bits_count;
let delta = 1_u64 << (64 - full_bits_count);
let poly_delta = 2 * polynomial_size.0 as u64 / p;
let lut_no_encode: LookupTableOwned =
self.generate_lookup_table_no_encode(|x| (2 * (x / poly_delta) + 1) * delta / 2);
let num_ct_blocks = 1;
let ct_seeded = CudaLweCiphertextList::from_lwe_ciphertext(&seeded, streams);
let mut ct_out: T = self.create_trivial_zero_radix(num_ct_blocks, streams);
let number_of_messages = 1;
let d_accumulator =
CudaGlweCiphertextList::from_glwe_ciphertext(&lut_no_encode.acc, streams);
let mut lut_vector_indexes: Vec<u64> = vec![u64::ZERO; number_of_messages];
for (i, ind) in lut_vector_indexes.iter_mut().enumerate() {
*ind = <usize as CastInto<u64>>::cast_into(i);
}
let mut d_lut_vector_indexes =
unsafe { CudaVec::<u64>::new_async(number_of_messages, streams, 0) };
unsafe { d_lut_vector_indexes.copy_from_cpu_async(&lut_vector_indexes, streams, 0) };
let lwe_indexes_usize: Vec<usize> = (0..num_ct_blocks).collect_vec();
let lwe_indexes = lwe_indexes_usize
.iter()
.map(|&x| <usize as CastInto<u64>>::cast_into(x))
.collect_vec();
let mut d_output_indexes = unsafe { CudaVec::<u64>::new_async(num_ct_blocks, streams, 0) };
let mut d_input_indexes = unsafe { CudaVec::<u64>::new_async(num_ct_blocks, streams, 0) };
unsafe {
d_input_indexes.copy_from_cpu_async(&lwe_indexes, streams, 0);
d_output_indexes.copy_from_cpu_async(&lwe_indexes, streams, 0);
}
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
cuda_programmable_bootstrap_lwe_ciphertext(
&ct_seeded,
&mut ct_out.as_mut().d_blocks,
&d_accumulator,
&d_lut_vector_indexes,
&d_output_indexes,
&d_input_indexes,
LweCiphertextCount(num_ct_blocks),
d_bsk,
streams,
);
}
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext(
&ct_seeded,
&mut ct_out.as_mut().d_blocks,
&d_accumulator,
&d_lut_vector_indexes,
&d_output_indexes,
&d_input_indexes,
d_multibit_bsk,
streams,
);
}
}
let plaintext_to_add = (p - 1) * delta / 2;
let ct_cloned = ct_out.duplicate(streams);
unsafe {
add_lwe_ciphertext_vector_plaintext_scalar_async(
streams,
&mut ct_out.as_mut().d_blocks.0.d_vec,
&ct_cloned.as_ref().d_blocks.0.d_vec,
plaintext_to_add,
out_lwe_dimension,
num_ct_blocks as u32,
);
}
streams.synchronize();
ct_out
}
}
#[cfg(test)]
pub(crate) mod test {
use crate::core_crypto::gpu::vec::GpuIndex;
use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::decrypt_lwe_ciphertext;
use crate::integer::gpu::server_key::radix::CudaUnsignedRadixCiphertext;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{gen_keys_gpu, CudaServerKey};
use crate::integer::{ClientKey, RadixCiphertext};
use crate::shortint::oprf::create_random_from_seed_modulus_switched;
use crate::shortint::parameters::PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
use rayon::prelude::*;
use statrs::distribution::ContinuousCDF;
use std::collections::HashMap;
use tfhe_csprng::seeders::Seed;
fn square(a: f64) -> f64 {
a * a
}
#[test]
fn oprf_compare_plain_ci_run_filter() {
let gpu_index = 0;
let streams = CudaStreams::new_single_gpu(GpuIndex(gpu_index));
let (ck, gpu_sk) = gen_keys_gpu(
PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
&streams,
);
for seed in 0..1000 {
oprf_compare_plain_from_seed(Seed(seed), &ck, &gpu_sk, &streams);
}
}
fn oprf_compare_plain_from_seed(
seed: Seed,
ck: &ClientKey,
sk: &CudaServerKey,
streams: &CudaStreams,
) {
let params = ck.parameters();
let random_bits_count = 2;
let input_p = 2 * params.polynomial_size().0 as u64;
let log_input_p = input_p.ilog2();
let p_prime = 1 << random_bits_count;
let output_p = 2 * params.carry_modulus().0 * params.message_modulus().0;
let poly_delta = 2 * params.polynomial_size().0 as u64 / p_prime;
let d_img: CudaUnsignedRadixCiphertext =
sk.generate_oblivious_pseudo_random(seed, random_bits_count, streams);
let img: RadixCiphertext = d_img.to_radix_ciphertext(streams);
let (lwe_size, polynomial_size) = match &sk.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => (
d_bsk.input_lwe_dimension().to_lwe_size(),
d_bsk.polynomial_size(),
),
CudaBootstrappingKey::MultiBit(d_multibit_bsk) => (
d_multibit_bsk.input_lwe_dimension().to_lwe_size(),
d_multibit_bsk.polynomial_size(),
),
};
let ct = create_random_from_seed_modulus_switched(
seed,
lwe_size,
polynomial_size.to_blind_rotation_input_modulus_log(),
sk.ciphertext_modulus,
);
let sk = ck.key.small_lwe_secret_key();
let plain_prf_input = decrypt_lwe_ciphertext(&sk, &ct)
.0
.wrapping_add(1 << (64 - log_input_p - 1))
>> (64 - log_input_p);
let half_negacyclic_part = |x| 2 * (x / poly_delta) + 1;
let negacyclic_part = |x| {
assert!(x < input_p);
if x < input_p / 2 {
half_negacyclic_part(x)
} else {
2 * output_p - half_negacyclic_part(x - (input_p / 2))
}
};
let prf = |x| {
let a = (negacyclic_part(x) + p_prime - 1) % (2 * output_p);
assert!(a % 2 == 0);
a / 2
};
let expected_output = prf(plain_prf_input);
let output = ck.key.decrypt_message_and_carry(&img.blocks[0]);
assert!(output < p_prime);
assert_eq!(output, expected_output);
}
#[test]
fn oprf_test_uniformity_ci_run_filter() {
let sample_count: usize = 100_000;
let p_value_limit: f64 = 0.000_01;
let gpu_index = 0;
let streams = CudaStreams::new_single_gpu(GpuIndex(gpu_index));
let (ck, gpu_sk) = gen_keys_gpu(
PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
&streams,
);
let test_uniformity = |distinct_values: u64, f: &(dyn Fn(usize) -> u64 + Sync)| {
test_uniformity(sample_count, p_value_limit, distinct_values, f)
};
let random_bits_count = 2;
test_uniformity(1 << random_bits_count, &|seed| {
let d_img: CudaUnsignedRadixCiphertext = gpu_sk.generate_oblivious_pseudo_random(
Seed(seed as u128),
random_bits_count,
&streams,
);
let img: RadixCiphertext = d_img.to_radix_ciphertext(&streams);
ck.key.decrypt_message_and_carry(&img.blocks[0])
});
}
pub fn test_uniformity<F>(sample_count: usize, p_value_limit: f64, distinct_values: u64, f: F)
where
F: Sync + Fn(usize) -> u64,
{
let p_value = uniformity_p_value(f, sample_count, distinct_values);
assert!(
p_value_limit < p_value,
"p_value (={p_value}) expected to be bigger than {p_value_limit}"
);
}
fn uniformity_p_value<F>(f: F, sample_count: usize, distinct_values: u64) -> f64
where
F: Sync + Fn(usize) -> u64,
{
let values: Vec<_> = (0..sample_count).into_par_iter().map(&f).collect();
let mut values_count = HashMap::new();
for i in &values {
assert!(*i < distinct_values, "i {} dv{}", *i, distinct_values);
*values_count.entry(i).or_insert(0) += 1;
}
let single_expected_count = sample_count as f64 / distinct_values as f64;
// https://en.wikipedia.org/wiki/Pearson's_chi-squared_test
let distance: f64 = (0..distinct_values)
.map(|value| *values_count.get(&value).unwrap_or(&0))
.map(|count| square(count as f64 - single_expected_count) / single_expected_count)
.sum();
statrs::distribution::ChiSquared::new((distinct_values - 1) as f64)
.unwrap()
.sf(distance)
}
}

View File

@@ -3,7 +3,9 @@ use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::LweBskGroupingFactor;
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext};
use crate::integer::gpu::server_key::radix::CudaRadixCiphertext;
use crate::integer::gpu::server_key::radix::{
CudaBlockInfo, CudaRadixCiphertext, CudaRadixCiphertextInfo,
};
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
use crate::integer::gpu::{apply_bivariate_lut_kb_async, PBSType};
@@ -23,9 +25,16 @@ impl CudaServerKey {
.map(|ciphertext| &ciphertext.as_ref().d_blocks),
streams,
);
let vec_block_info: Vec<CudaBlockInfo> = radixes
.iter()
.flat_map(|ct| ct.as_ref().info.blocks.clone())
.collect();
let radix_info = CudaRadixCiphertextInfo {
blocks: vec_block_info,
};
CudaIntegerRadixCiphertext::from(CudaRadixCiphertext {
d_blocks: packed_list,
info: radixes[0].as_ref().info.clone(),
info: radix_info,
})
}

View File

@@ -3,6 +3,7 @@ use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::{LweBskGroupingFactor, UnsignedInteger};
use crate::integer::block_decomposition::{BlockDecomposer, Decomposable, DecomposableInto};
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo};
use crate::integer::gpu::ciphertext::{CudaIntegerRadixCiphertext, CudaUnsignedRadixCiphertext};
use crate::integer::gpu::server_key::radix::CudaRadixCiphertext;
use crate::integer::gpu::server_key::{CudaBootstrappingKey, CudaServerKey};
@@ -31,14 +32,17 @@ impl CudaServerKey {
.map(|ciphertext| &ciphertext.0.ciphertext.d_blocks),
streams,
);
let blocks_ct: CudaUnsignedRadixCiphertext = CudaUnsignedRadixCiphertext {
ciphertext: CudaRadixCiphertext {
d_blocks: packed_list,
info: selectors[0].0.ciphertext.info.clone(),
},
let vec_block_info: Vec<CudaBlockInfo> = selectors
.iter()
.flat_map(|ct| ct.0.ciphertext.info.blocks.clone())
.collect();
let radix_info = CudaRadixCiphertextInfo {
blocks: vec_block_info,
};
blocks_ct
CudaIntegerRadixCiphertext::from(CudaRadixCiphertext {
d_blocks: packed_list,
info: radix_info,
})
}
#[allow(clippy::unused_self)]
pub(crate) fn convert_radixes_vec_to_single_radix_ciphertext<T>(
@@ -56,10 +60,16 @@ impl CudaServerKey {
radixes.iter().map(|ciphertext| &ciphertext.d_blocks),
streams,
);
let vec_block_info: Vec<CudaBlockInfo> = radixes
.iter()
.flat_map(|ct| ct.info.blocks.clone())
.collect();
let radix_info = CudaRadixCiphertextInfo {
blocks: vec_block_info,
};
CudaIntegerRadixCiphertext::from(CudaRadixCiphertext {
d_blocks: packed_list,
info: radixes[0].info.clone(),
info: radix_info,
})
}

View File

@@ -160,20 +160,15 @@ where
pub(crate) fn fill_accumulator_no_encoding<F, C>(
accumulator: &mut GlweCiphertext<C>,
server_key: &ServerKey,
polynomial_size: PolynomialSize,
glwe_size: GlweSize,
f: F,
) where
C: ContainerMut<Element = u64>,
F: Fn(u64) -> u64,
{
assert_eq!(
accumulator.polynomial_size(),
server_key.bootstrapping_key.polynomial_size()
);
assert_eq!(
accumulator.glwe_size(),
server_key.bootstrapping_key.glwe_size()
);
assert_eq!(accumulator.polynomial_size(), polynomial_size);
assert_eq!(accumulator.glwe_size(), glwe_size);
let mut accumulator_view = accumulator.as_mut_view();

View File

@@ -1,8 +1,8 @@
use super::Ciphertext;
use crate::core_crypto::fft_impl::common::modulus_switch;
use crate::core_crypto::prelude::{
keyswitch_lwe_ciphertext, lwe_ciphertext_plaintext_add_assign, CiphertextModulusLog,
LweCiphertext, LweSize, Plaintext,
keyswitch_lwe_ciphertext, lwe_ciphertext_plaintext_add_assign, CiphertextModulus,
CiphertextModulusLog, LweCiphertext, LweSize, Plaintext,
};
use crate::shortint::ciphertext::Degree;
use crate::shortint::engine::ShortintEngine;
@@ -28,35 +28,33 @@ pub fn sha3_hash(values: &mut [u64], seed: Seed) {
*value = u64::from_le_bytes(bytes);
}
}
pub fn create_random_from_seed(
seed: Seed,
lwe_size: LweSize,
ciphertext_modulus: CiphertextModulus<u64>,
) -> LweCiphertext<Vec<u64>> {
let mut ct = LweCiphertext::new(0, lwe_size, ciphertext_modulus);
sha3_hash(ct.get_mut_mask().as_mut(), seed);
ct
}
pub fn create_random_from_seed_modulus_switched(
seed: Seed,
lwe_size: LweSize,
log_modulus: CiphertextModulusLog,
ciphertext_modulus: CiphertextModulus<u64>,
) -> LweCiphertext<Vec<u64>> {
let mut ct = create_random_from_seed(seed, lwe_size, ciphertext_modulus);
for i in ct.as_mut() {
*i = modulus_switch(*i, log_modulus) << (64 - log_modulus.0);
}
ct
}
impl ServerKey {
pub(crate) fn create_random_from_seed(
&self,
seed: Seed,
lwe_size: LweSize,
) -> LweCiphertext<Vec<u64>> {
let mut ct = LweCiphertext::new(0, lwe_size, self.ciphertext_modulus);
sha3_hash(ct.get_mut_mask().as_mut(), seed);
ct
}
pub(crate) fn create_random_from_seed_modulus_switched(
&self,
seed: Seed,
lwe_size: LweSize,
log_modulus: CiphertextModulusLog,
) -> LweCiphertext<Vec<u64>> {
let mut ct = self.create_random_from_seed(seed, lwe_size);
for i in ct.as_mut() {
*i = modulus_switch(*i, log_modulus) << (64 - log_modulus.0);
}
ct
}
/// Uniformly generates a random encrypted value in `[0, 2^random_bits_count[`
/// `2^random_bits_count` must be smaller than the message modulus
/// The encryted value is oblivious to the server
@@ -123,12 +121,13 @@ impl ServerKey {
let in_lwe_size = self.bootstrapping_key.input_lwe_dimension().to_lwe_size();
let seeded = self.create_random_from_seed_modulus_switched(
let seeded = create_random_from_seed_modulus_switched(
seed,
in_lwe_size,
self.bootstrapping_key
.polynomial_size()
.to_blind_rotation_input_modulus_log(),
self.ciphertext_modulus,
);
let p = 1 << random_bits_count;
@@ -183,6 +182,7 @@ impl ServerKey {
#[cfg(test)]
pub(crate) mod test {
use crate::core_crypto::prelude::decrypt_lwe_ciphertext;
use crate::shortint::oprf::create_random_from_seed_modulus_switched;
use crate::shortint::{ClientKey, ServerKey};
use rayon::prelude::*;
use statrs::distribution::ContinuousCDF;
@@ -223,12 +223,13 @@ pub(crate) mod test {
let lwe_size = sk.bootstrapping_key.input_lwe_dimension().to_lwe_size();
let ct = sk.create_random_from_seed_modulus_switched(
let ct = create_random_from_seed_modulus_switched(
seed,
lwe_size,
sk.bootstrapping_key
.polynomial_size()
.to_blind_rotation_input_modulus_log(),
sk.ciphertext_modulus,
);
let sk = ck.small_lwe_secret_key();

View File

@@ -640,7 +640,12 @@ impl ServerKey {
self.bootstrapping_key.polynomial_size(),
self.ciphertext_modulus,
);
fill_accumulator_no_encoding(&mut acc, self, f);
fill_accumulator_no_encoding(
&mut acc,
self.bootstrapping_key.polynomial_size(),
self.bootstrapping_key.glwe_size(),
f,
);
LookupTableOwned {
acc,