mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-10 07:08:03 -05:00
fix(gpu): add indexes to modulus switch noise reduction
This commit is contained in:
committed by
Agnès Leroy
parent
45fdba04b1
commit
c19cd9f021
@@ -28,9 +28,10 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
|
||||
|
||||
void cuda_improve_noise_modulus_switch_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
|
||||
uint32_t num_lwes, uint32_t num_zeros, double input_variance,
|
||||
double r_sigma, double bound, uint32_t log_modulus);
|
||||
void const *lwe_array_in, void const *lwe_array_indexes,
|
||||
void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
|
||||
uint32_t num_zeros, double input_variance, double r_sigma, double bound,
|
||||
uint32_t log_modulus);
|
||||
|
||||
void cuda_glwe_sample_extract_128(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
|
||||
@@ -248,6 +248,7 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
|
||||
__uint128_t *global_accumulator;
|
||||
double *global_join_buffer;
|
||||
__uint128_t *temp_lwe_array_in;
|
||||
uint64_t *trivial_indexes;
|
||||
|
||||
PBS_VARIANT pbs_variant;
|
||||
bool uses_noise_reduction;
|
||||
@@ -263,11 +264,27 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
|
||||
cuda_set_device(gpu_index);
|
||||
this->pbs_variant = pbs_variant;
|
||||
this->uses_noise_reduction = allocate_ms_array;
|
||||
this->temp_lwe_array_in =
|
||||
(__uint128_t *)cuda_malloc_with_size_tracking_async(
|
||||
(lwe_dimension + 1) * input_lwe_ciphertext_count *
|
||||
sizeof(__uint128_t),
|
||||
stream, gpu_index, size_tracker, allocate_ms_array);
|
||||
if (allocate_ms_array) {
|
||||
this->temp_lwe_array_in =
|
||||
(__uint128_t *)cuda_malloc_with_size_tracking_async(
|
||||
(lwe_dimension + 1) * input_lwe_ciphertext_count *
|
||||
sizeof(__uint128_t),
|
||||
stream, gpu_index, size_tracker, allocate_ms_array);
|
||||
this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
|
||||
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
|
||||
size_tracker, allocate_ms_array);
|
||||
uint64_t *h_trivial_indexes = new uint64_t[input_lwe_ciphertext_count];
|
||||
for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
|
||||
h_trivial_indexes[i] = i;
|
||||
|
||||
cuda_memcpy_with_size_tracking_async_to_gpu(
|
||||
trivial_indexes, h_trivial_indexes,
|
||||
input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
|
||||
allocate_gpu_memory);
|
||||
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
delete[] h_trivial_indexes;
|
||||
}
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
|
||||
input_lwe_ciphertext_count *
|
||||
@@ -404,9 +421,12 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
|
||||
cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
|
||||
gpu_memory_allocated);
|
||||
|
||||
if (uses_noise_reduction)
|
||||
if (uses_noise_reduction) {
|
||||
cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
|
||||
gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(trivial_indexes, stream, gpu_index,
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -86,13 +86,15 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
|
||||
|
||||
void cuda_improve_noise_modulus_switch_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
|
||||
uint32_t num_lwes, uint32_t num_zeros, double input_variance,
|
||||
double r_sigma, double bound, uint32_t log_modulus) {
|
||||
void const *lwe_array_in, void const *lwe_array_indexes,
|
||||
void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
|
||||
uint32_t num_zeros, double input_variance, double r_sigma, double bound,
|
||||
uint32_t log_modulus) {
|
||||
host_improve_noise_modulus_switch<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<uint64_t const *>(lwe_array_in),
|
||||
static_cast<uint64_t const *>(lwe_array_indexes),
|
||||
static_cast<const uint64_t *>(encrypted_zeros), lwe_size, num_lwes,
|
||||
num_zeros, input_variance, r_sigma, bound, log_modulus);
|
||||
}
|
||||
|
||||
@@ -178,11 +178,10 @@ __device__ __forceinline__ double measure_modulus_switch_noise(
|
||||
|
||||
// Each thread processes two elements of the lwe array
|
||||
template <typename Torus>
|
||||
__global__ void
|
||||
improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
|
||||
const Torus *zeros, int lwe_size, int num_zeros,
|
||||
double input_variance, double r_sigma,
|
||||
double bound, uint32_t log_modulus) {
|
||||
__global__ void improve_noise_modulus_switch(
|
||||
Torus *array_out, const Torus *array_in, const uint64_t *indexes,
|
||||
const Torus *zeros, int lwe_size, int num_zeros, double input_variance,
|
||||
double r_sigma, double bound, uint32_t log_modulus) {
|
||||
|
||||
// First we will assume size is less than the number of threads per block
|
||||
// I should switch this to dynamic shared memory
|
||||
@@ -198,13 +197,13 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
|
||||
// This probably are not needed cause we are setting the values
|
||||
sum_mask_errors[threadIdx.x] = 0.f;
|
||||
sum_squared_mask_errors[threadIdx.x] = 0.f;
|
||||
auto this_block_lwe_in = array_in + indexes[blockIdx.x] * lwe_size;
|
||||
auto this_block_lwe_out = array_out + indexes[blockIdx.x] * lwe_size;
|
||||
Torus input_element1 = this_block_lwe_in[threadIdx.x];
|
||||
|
||||
Torus input_element1 = array_in[threadIdx.x + blockIdx.x * lwe_size];
|
||||
|
||||
Torus input_element2 =
|
||||
threadIdx.x + blockDim.x < lwe_size
|
||||
? array_in[threadIdx.x + blockDim.x + blockIdx.x * lwe_size]
|
||||
: 0;
|
||||
Torus input_element2 = threadIdx.x + blockDim.x < lwe_size
|
||||
? this_block_lwe_in[threadIdx.x + blockDim.x]
|
||||
: 0;
|
||||
|
||||
// Base noise is only handled by thread 0
|
||||
double base_noise = measure_modulus_switch_noise<Torus>(
|
||||
@@ -218,11 +217,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
|
||||
__syncthreads();
|
||||
|
||||
if (found)
|
||||
array_out[threadIdx.x + blockIdx.x * lwe_size] = input_element1;
|
||||
this_block_lwe_out[threadIdx.x] = input_element1;
|
||||
|
||||
if (found && (threadIdx.x + blockDim.x) < lwe_size)
|
||||
array_out[threadIdx.x + blockDim.x + blockIdx.x * lwe_size] =
|
||||
input_element2;
|
||||
this_block_lwe_out[threadIdx.x + blockDim.x] = input_element2;
|
||||
|
||||
__syncthreads();
|
||||
// If we found a zero element we stop iterating (in avg 20 times are
|
||||
@@ -253,11 +251,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
|
||||
// Assumption we always have at least 512 elements
|
||||
// If we find a useful zero encryption we replace the lwe by lwe + zero
|
||||
if (found)
|
||||
array_out[threadIdx.x + blockIdx.x * lwe_size] = zero_element1;
|
||||
this_block_lwe_out[threadIdx.x] = zero_element1;
|
||||
|
||||
if (found && (threadIdx.x + blockDim.x) < lwe_size)
|
||||
array_out[threadIdx.x + blockDim.x + blockIdx.x * lwe_size] =
|
||||
zero_element2;
|
||||
this_block_lwe_out[threadIdx.x + blockDim.x] = zero_element2;
|
||||
|
||||
__syncthreads();
|
||||
// If we found a zero element we stop iterating (in avg 20 times are
|
||||
@@ -270,9 +267,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
|
||||
template <typename Torus>
|
||||
__host__ void host_improve_noise_modulus_switch(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *array_out,
|
||||
Torus const *array_in, const Torus *zeros, uint32_t lwe_size,
|
||||
uint32_t num_lwes, const uint32_t num_zeros, const double input_variance,
|
||||
const double r_sigma, const double bound, uint32_t log_modulus) {
|
||||
Torus const *array_in, uint64_t const *indexes, const Torus *zeros,
|
||||
uint32_t lwe_size, uint32_t num_lwes, const uint32_t num_zeros,
|
||||
const double input_variance, const double r_sigma, const double bound,
|
||||
uint32_t log_modulus) {
|
||||
|
||||
if (lwe_size < 512) {
|
||||
PANIC("The lwe_size is less than 512, this is not supported\n");
|
||||
@@ -289,8 +287,8 @@ __host__ void host_improve_noise_modulus_switch(
|
||||
int num_threads = 512, num_blocks = num_lwes;
|
||||
|
||||
improve_noise_modulus_switch<Torus><<<num_blocks, num_threads, 0, stream>>>(
|
||||
array_out, array_in, zeros, lwe_size, num_zeros, input_variance, r_sigma,
|
||||
bound, log_modulus);
|
||||
array_out, array_in, indexes, zeros, lwe_size, num_zeros, input_variance,
|
||||
r_sigma, bound, log_modulus);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
|
||||
@@ -194,7 +194,8 @@ void execute_pbs_async(
|
||||
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
|
||||
|
||||
void *zeros = nullptr;
|
||||
if (ms_noise_reduction_key != nullptr)
|
||||
if (ms_noise_reduction_key != nullptr &&
|
||||
ms_noise_reduction_key->ptr != nullptr)
|
||||
zeros = ms_noise_reduction_key->ptr[i];
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
streams[i], gpu_indexes[i], current_lwe_array_out,
|
||||
|
||||
@@ -660,13 +660,15 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
|
||||
|
||||
// If the parameters contain noise reduction key, then apply it
|
||||
if (ms_noise_reduction_key != nullptr) {
|
||||
if (ms_noise_reduction_key != nullptr &&
|
||||
ms_noise_reduction_key->ptr != nullptr) {
|
||||
if (ms_noise_reduction_key->num_zeros != 0) {
|
||||
uint32_t log_modulus = log2(polynomial_size) + 1;
|
||||
host_improve_noise_modulus_switch<uint64_t>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
buffer->temp_lwe_array_in,
|
||||
static_cast<uint64_t const *>(lwe_array_in),
|
||||
static_cast<uint64_t const *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(ms_noise_reduction_ptr), lwe_dimension + 1,
|
||||
num_samples, ms_noise_reduction_key->num_zeros,
|
||||
ms_noise_reduction_key->ms_input_variance,
|
||||
|
||||
@@ -256,6 +256,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
static_cast<cudaStream_t>(stream), gpu_index,
|
||||
static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
|
||||
static_cast<__uint128_t const *>(lwe_array_in),
|
||||
static_cast<uint64_t const *>(buffer->trivial_indexes),
|
||||
static_cast<const __uint128_t *>(ms_noise_reduction_ptr),
|
||||
lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
|
||||
ms_noise_reduction_key->ms_input_variance,
|
||||
|
||||
@@ -50,6 +50,7 @@ unsafe extern "C" {
|
||||
gpu_index: u32,
|
||||
lwe_array_out: *mut ffi::c_void,
|
||||
lwe_array_in: *const ffi::c_void,
|
||||
lwe_array_indexes: *const ffi::c_void,
|
||||
encrypted_zeros: *const ffi::c_void,
|
||||
lwe_size: u32,
|
||||
num_lwes: u32,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use super::super::test::TestResources;
|
||||
use crate::core_crypto::commons::test_tools::{check_both_ratio_under, mean, variance};
|
||||
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
|
||||
use crate::core_crypto::gpu::CudaStreams;
|
||||
use crate::core_crypto::gpu::{CudaStreams, CudaVec};
|
||||
use crate::core_crypto::prelude::*;
|
||||
|
||||
use crate::core_crypto::gpu::GpuIndex;
|
||||
@@ -147,6 +147,10 @@ fn check_noise_improve_modulus_switch_noise(
|
||||
|
||||
let gpu_index = 0;
|
||||
let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
|
||||
let num_blocks = 1;
|
||||
let lwe_indexes: Vec<u64> = (0..num_blocks).map(|x| x as u64).collect();
|
||||
let mut d_input_indexes = unsafe { CudaVec::<u64>::new_async(num_blocks, &streams, 0) };
|
||||
unsafe { d_input_indexes.copy_from_cpu_async(&lwe_indexes, &streams, 0) };
|
||||
|
||||
let d_encryptions_of_zero = CudaLweCiphertextList::from_lwe_ciphertext_list(
|
||||
&encryptions_of_zero,
|
||||
@@ -186,6 +190,7 @@ fn check_noise_improve_modulus_switch_noise(
|
||||
streams.gpu_indexes[0].get(),
|
||||
d_ct.0.d_vec.as_mut_c_ptr(0),
|
||||
d_ct_in.0.d_vec.as_c_ptr(0),
|
||||
d_input_indexes.as_c_ptr(0),
|
||||
d_encryptions_of_zero.0.d_vec.as_c_ptr(0),
|
||||
lwe_dimension.to_lwe_size().0 as u32,
|
||||
d_ct.lwe_ciphertext_count().0 as u32,
|
||||
|
||||
@@ -606,40 +606,6 @@ pub unsafe fn cuda_modulus_switch_ciphertext_async<T: UnsignedInteger>(
|
||||
);
|
||||
}
|
||||
|
||||
/// # Safety
|
||||
///
|
||||
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
|
||||
/// required
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub unsafe fn cuda_improve_noise_modulus_switch_ciphertext_async<T: UnsignedInteger>(
|
||||
streams: &CudaStreams,
|
||||
lwe_array_out: &mut CudaVec<T>,
|
||||
lwe_array_in: &CudaVec<T>,
|
||||
encrypted_zeros: &CudaVec<T>,
|
||||
lwe_dimension: LweDimension,
|
||||
num_samples: u32,
|
||||
num_zeros: u32,
|
||||
input_variance: f64,
|
||||
r_sigma_factor: f64,
|
||||
bound: f64,
|
||||
log_modulus: u32,
|
||||
) {
|
||||
cuda_improve_noise_modulus_switch_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
lwe_array_out.as_mut_c_ptr(0),
|
||||
lwe_array_in.as_c_ptr(0),
|
||||
encrypted_zeros.as_c_ptr(0),
|
||||
lwe_dimension.to_lwe_size().0 as u32,
|
||||
num_samples,
|
||||
num_zeros,
|
||||
input_variance,
|
||||
r_sigma_factor,
|
||||
bound,
|
||||
log_modulus,
|
||||
);
|
||||
}
|
||||
|
||||
/// Addition of a vector of LWE ciphertexts
|
||||
///
|
||||
/// # Safety
|
||||
|
||||
Reference in New Issue
Block a user