fix(gpu): add indexes to modulus switch noise reduction

Author: Guillermo Oyarzun, 2025-05-13 13:33:46 +02:00
Committer: Agnès Leroy
parent 45fdba04b1
commit c19cd9f021
10 changed files with 68 additions and 71 deletions

View File

@@ -28,9 +28,10 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
 void cuda_improve_noise_modulus_switch_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
-    uint32_t num_lwes, uint32_t num_zeros, double input_variance,
-    double r_sigma, double bound, uint32_t log_modulus);
+    void const *lwe_array_in, void const *lwe_array_indexes,
+    void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
+    uint32_t num_zeros, double input_variance, double r_sigma, double bound,
+    uint32_t log_modulus);
 void cuda_glwe_sample_extract_128(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
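For reference, the updated entry point expects a device-resident table with one 64-bit index per input ciphertext, telling each block which slot of the input/output arrays it owns. A minimal sketch of a call with an identity mapping, assuming the helper signatures shown elsewhere in this commit (all variable and function names here are illustrative):

#include <cstdint>

// Sketch only: build an identity index table (ciphertext i -> slot i) and
// pass it to the re-signed entry point. d_out, d_in, d_zeros are device
// pointers prepared elsewhere.
void call_with_identity_indexes(void *stream, uint32_t gpu_index, void *d_out,
                                void const *d_in, void const *d_zeros,
                                uint32_t lwe_size, uint32_t num_lwes,
                                uint32_t num_zeros, double input_variance,
                                double r_sigma, double bound,
                                uint32_t log_modulus) {
  uint64_t size_tracker = 0;
  // One 64-bit index per input ciphertext, filled with 0..num_lwes-1.
  auto *d_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
      num_lwes * sizeof(uint64_t), stream, gpu_index, size_tracker, true);
  auto *h_indexes = new uint64_t[num_lwes];
  for (uint32_t i = 0; i < num_lwes; i++)
    h_indexes[i] = i;
  cuda_memcpy_with_size_tracking_async_to_gpu(d_indexes, h_indexes,
                                              num_lwes * sizeof(uint64_t),
                                              stream, gpu_index, true);
  cuda_synchronize_stream(stream, gpu_index); // copy still reads h_indexes
  delete[] h_indexes;

  cuda_improve_noise_modulus_switch_64(
      stream, gpu_index, d_out, d_in, d_indexes, d_zeros, lwe_size, num_lwes,
      num_zeros, input_variance, r_sigma, bound, log_modulus);
}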

View File

@@ -248,6 +248,7 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
   __uint128_t *global_accumulator;
   double *global_join_buffer;
   __uint128_t *temp_lwe_array_in;
+  uint64_t *trivial_indexes;
   PBS_VARIANT pbs_variant;
   bool uses_noise_reduction;
@@ -263,11 +264,27 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
     cuda_set_device(gpu_index);
     this->pbs_variant = pbs_variant;
     this->uses_noise_reduction = allocate_ms_array;
-    this->temp_lwe_array_in =
-        (__uint128_t *)cuda_malloc_with_size_tracking_async(
-            (lwe_dimension + 1) * input_lwe_ciphertext_count *
-                sizeof(__uint128_t),
-            stream, gpu_index, size_tracker, allocate_ms_array);
+    if (allocate_ms_array) {
+      this->temp_lwe_array_in =
+          (__uint128_t *)cuda_malloc_with_size_tracking_async(
+              (lwe_dimension + 1) * input_lwe_ciphertext_count *
+                  sizeof(__uint128_t),
+              stream, gpu_index, size_tracker, allocate_ms_array);
+      this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
+          input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
+          size_tracker, allocate_ms_array);
+      uint64_t *h_trivial_indexes = new uint64_t[input_lwe_ciphertext_count];
+      for (uint32_t i = 0; i < input_lwe_ciphertext_count; i++)
+        h_trivial_indexes[i] = i;
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          trivial_indexes, h_trivial_indexes,
+          input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
+          allocate_gpu_memory);
+      cuda_synchronize_stream(stream, gpu_index);
+      delete[] h_trivial_indexes;
+    }
     auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
     size_t global_join_buffer_size = (glwe_dimension + 1) * level_count *
                                      input_lwe_ciphertext_count *
@@ -404,9 +421,12 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
     cuda_drop_with_size_tracking_async(global_accumulator, stream, gpu_index,
                                        gpu_memory_allocated);
-    if (uses_noise_reduction)
+    if (uses_noise_reduction) {
       cuda_drop_with_size_tracking_async(temp_lwe_array_in, stream, gpu_index,
                                          gpu_memory_allocated);
+      cuda_drop_with_size_tracking_async(trivial_indexes, stream, gpu_index,
+                                         gpu_memory_allocated);
+    }
   }
 };
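A note on the constructor above: the explicit `cuda_synchronize_stream` is there because the host-to-device copy is asynchronous and may still be reading `h_trivial_indexes` when `delete[]` would otherwise run. The same identity-table setup could be written with `std::iota`; a hedged sketch under the same assumed helper signatures:

#include <cstdint>
#include <numeric>
#include <vector>

// Sketch of an equivalent trivial-index setup. std::iota replaces the manual
// fill loop; the vector's lifetime is extended past the async copy.
void fill_trivial_indexes(uint64_t *d_indexes, uint32_t count, void *stream,
                          uint32_t gpu_index, bool gpu_memory_allocated) {
  std::vector<uint64_t> h_indexes(count);
  std::iota(h_indexes.begin(), h_indexes.end(), 0); // 0, 1, ..., count - 1
  cuda_memcpy_with_size_tracking_async_to_gpu(
      d_indexes, h_indexes.data(), count * sizeof(uint64_t), stream, gpu_index,
      gpu_memory_allocated);
  // The copy is asynchronous: keep h_indexes alive until the stream is done.
  cuda_synchronize_stream(stream, gpu_index);
}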

View File

@@ -86,13 +86,15 @@ void cuda_modulus_switch_inplace_64(void *stream, uint32_t gpu_index,
 void cuda_improve_noise_modulus_switch_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
-    void const *lwe_array_in, void const *encrypted_zeros, uint32_t lwe_size,
-    uint32_t num_lwes, uint32_t num_zeros, double input_variance,
-    double r_sigma, double bound, uint32_t log_modulus) {
+    void const *lwe_array_in, void const *lwe_array_indexes,
+    void const *encrypted_zeros, uint32_t lwe_size, uint32_t num_lwes,
+    uint32_t num_zeros, double input_variance, double r_sigma, double bound,
+    uint32_t log_modulus) {
   host_improve_noise_modulus_switch<uint64_t>(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint64_t *>(lwe_array_out),
       static_cast<uint64_t const *>(lwe_array_in),
+      static_cast<uint64_t const *>(lwe_array_indexes),
       static_cast<const uint64_t *>(encrypted_zeros), lwe_size, num_lwes,
       num_zeros, input_variance, r_sigma, bound, log_modulus);
 }

View File

@@ -178,11 +178,10 @@ __device__ __forceinline__ double measure_modulus_switch_noise(
 // Each thread processes two elements of the lwe array
 template <typename Torus>
-__global__ void
-improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
-                             const Torus *zeros, int lwe_size, int num_zeros,
-                             double input_variance, double r_sigma,
-                             double bound, uint32_t log_modulus) {
+__global__ void improve_noise_modulus_switch(
+    Torus *array_out, const Torus *array_in, const uint64_t *indexes,
+    const Torus *zeros, int lwe_size, int num_zeros, double input_variance,
+    double r_sigma, double bound, uint32_t log_modulus) {
   // First we will assume size is less than the number of threads per block
   // I should switch this to dynamic shared memory
@@ -198,13 +197,13 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
   // This probably are not needed cause we are setting the values
   sum_mask_errors[threadIdx.x] = 0.f;
   sum_squared_mask_errors[threadIdx.x] = 0.f;
-  Torus input_element1 = array_in[threadIdx.x + blockIdx.x * lwe_size];
-  Torus input_element2 =
-      threadIdx.x + blockDim.x < lwe_size
-          ? array_in[threadIdx.x + blockDim.x + blockIdx.x * lwe_size]
-          : 0;
+  auto this_block_lwe_in = array_in + indexes[blockIdx.x] * lwe_size;
+  auto this_block_lwe_out = array_out + indexes[blockIdx.x] * lwe_size;
+  Torus input_element1 = this_block_lwe_in[threadIdx.x];
+  Torus input_element2 = threadIdx.x + blockDim.x < lwe_size
+                             ? this_block_lwe_in[threadIdx.x + blockDim.x]
+                             : 0;
   // Base noise is only handled by thread 0
   double base_noise = measure_modulus_switch_noise<Torus>(
@@ -218,11 +217,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
   __syncthreads();
   if (found)
-    array_out[threadIdx.x + blockIdx.x * lwe_size] = input_element1;
+    this_block_lwe_out[threadIdx.x] = input_element1;
   if (found && (threadIdx.x + blockDim.x) < lwe_size)
-    array_out[threadIdx.x + blockDim.x + blockIdx.x * lwe_size] =
-        input_element2;
+    this_block_lwe_out[threadIdx.x + blockDim.x] = input_element2;
   __syncthreads();
   // If we found a zero element we stop iterating (in avg 20 times are
@@ -253,11 +251,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
   // Assumption we always have at least 512 elements
   // If we find a useful zero encryption we replace the lwe by lwe + zero
   if (found)
-    array_out[threadIdx.x + blockIdx.x * lwe_size] = zero_element1;
+    this_block_lwe_out[threadIdx.x] = zero_element1;
   if (found && (threadIdx.x + blockDim.x) < lwe_size)
-    array_out[threadIdx.x + blockDim.x + blockIdx.x * lwe_size] =
-        zero_element2;
+    this_block_lwe_out[threadIdx.x + blockDim.x] = zero_element2;
   __syncthreads();
   // If we found a zero element we stop iterating (in avg 20 times are
@@ -270,9 +267,10 @@ improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
 template <typename Torus>
 __host__ void host_improve_noise_modulus_switch(
     cudaStream_t stream, uint32_t gpu_index, Torus *array_out,
-    Torus const *array_in, const Torus *zeros, uint32_t lwe_size,
-    uint32_t num_lwes, const uint32_t num_zeros, const double input_variance,
-    const double r_sigma, const double bound, uint32_t log_modulus) {
+    Torus const *array_in, uint64_t const *indexes, const Torus *zeros,
+    uint32_t lwe_size, uint32_t num_lwes, const uint32_t num_zeros,
+    const double input_variance, const double r_sigma, const double bound,
+    uint32_t log_modulus) {
   if (lwe_size < 512) {
     PANIC("The lwe_size is less than 512, this is not supported\n");
@@ -289,8 +287,8 @@ __host__ void host_improve_noise_modulus_switch(
   int num_threads = 512, num_blocks = num_lwes;
   improve_noise_modulus_switch<Torus><<<num_blocks, num_threads, 0, stream>>>(
-      array_out, array_in, zeros, lwe_size, num_zeros, input_variance, r_sigma,
-      bound, log_modulus);
+      array_out, array_in, indexes, zeros, lwe_size, num_zeros, input_variance,
+      r_sigma, bound, log_modulus);
   check_cuda_error(cudaGetLastError());
 }
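The essential change in this kernel is the indirection: a block no longer assumes that ciphertext `blockIdx.x` lives at slot `blockIdx.x`, but resolves its base offset through `indexes[blockIdx.x]`. A stripped-down sketch of that addressing pattern, with the same one-block-per-ciphertext, two-elements-per-thread layout (the kernel itself is illustrative, not part of the commit):

#include <cstdint>

// Illustrative kernel: one block per ciphertext, two elements per thread,
// with the ciphertext located through an index table rather than blockIdx.x.
template <typename Torus>
__global__ void copy_indexed_lwe(Torus *array_out, const Torus *array_in,
                                 const uint64_t *indexes, int lwe_size) {
  // Resolve this block's ciphertext slot once; all threads reuse the base.
  const Torus *in = array_in + indexes[blockIdx.x] * lwe_size;
  Torus *out = array_out + indexes[blockIdx.x] * lwe_size;

  // First element is always in range because blockDim.x <= lwe_size here.
  out[threadIdx.x] = in[threadIdx.x];
  // Second element needs a bound check when lwe_size < 2 * blockDim.x.
  if (threadIdx.x + blockDim.x < lwe_size)
    out[threadIdx.x + blockDim.x] = in[threadIdx.x + blockDim.x];
}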

View File

@@ -194,7 +194,8 @@ void execute_pbs_async(
         lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
     void *zeros = nullptr;
-    if (ms_noise_reduction_key != nullptr)
+    if (ms_noise_reduction_key != nullptr &&
+        ms_noise_reduction_key->ptr != nullptr)
       zeros = ms_noise_reduction_key->ptr[i];
     cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
         streams[i], gpu_indexes[i], current_lwe_array_out,

View File

@@ -660,13 +660,15 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
       (pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
   // If the parameters contain noise reduction key, then apply it
-  if (ms_noise_reduction_key != nullptr) {
+  if (ms_noise_reduction_key != nullptr &&
+      ms_noise_reduction_key->ptr != nullptr) {
     if (ms_noise_reduction_key->num_zeros != 0) {
       uint32_t log_modulus = log2(polynomial_size) + 1;
       host_improve_noise_modulus_switch<uint64_t>(
           static_cast<cudaStream_t>(stream), gpu_index,
           buffer->temp_lwe_array_in,
           static_cast<uint64_t const *>(lwe_array_in),
+          static_cast<uint64_t const *>(lwe_input_indexes),
           static_cast<uint64_t *>(ms_noise_reduction_ptr), lwe_dimension + 1,
           num_samples, ms_noise_reduction_key->num_zeros,
           ms_noise_reduction_key->ms_input_variance,

View File

@@ -256,6 +256,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
         static_cast<cudaStream_t>(stream), gpu_index,
         static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
         static_cast<__uint128_t const *>(lwe_array_in),
+        static_cast<uint64_t const *>(buffer->trivial_indexes),
         static_cast<const __uint128_t *>(ms_noise_reduction_ptr),
         lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
         ms_noise_reduction_key->ms_input_variance,
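Unlike the 64-bit path, which forwards the caller's `lwe_input_indexes`, this 128-bit call site appears to have no user-supplied index table, so it falls back to the identity table built in the `pbs_buffer_128` constructor. A small sketch of why that is behavior-preserving (the helper is hypothetical):

#include <cstdint>

// Sketch: with trivial_indexes[i] == i, the indexed base pointer computed by
// the kernel collapses to the old contiguous addressing, so the 128-bit path
// gains only the plumbing, not a behavior change.
inline const __uint128_t *block_base(const __uint128_t *array_in,
                                     const uint64_t *indexes, uint32_t block,
                                     uint32_t lwe_size) {
  return array_in + indexes[block] * lwe_size; // indexes[block] == block here
}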

View File

@@ -50,6 +50,7 @@ unsafe extern "C" {
         gpu_index: u32,
         lwe_array_out: *mut ffi::c_void,
         lwe_array_in: *const ffi::c_void,
+        lwe_array_indexes: *const ffi::c_void,
         encrypted_zeros: *const ffi::c_void,
         lwe_size: u32,
         num_lwes: u32,

View File

@@ -1,7 +1,7 @@
 use super::super::test::TestResources;
 use crate::core_crypto::commons::test_tools::{check_both_ratio_under, mean, variance};
 use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
-use crate::core_crypto::gpu::CudaStreams;
+use crate::core_crypto::gpu::{CudaStreams, CudaVec};
 use crate::core_crypto::prelude::*;
 use crate::core_crypto::gpu::GpuIndex;
@@ -147,6 +147,10 @@ fn check_noise_improve_modulus_switch_noise(
     let gpu_index = 0;
     let streams = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
     let num_blocks = 1;
+    let lwe_indexes: Vec<u64> = (0..num_blocks).map(|x| x as u64).collect();
+    let mut d_input_indexes = unsafe { CudaVec::<u64>::new_async(num_blocks, &streams, 0) };
+    unsafe { d_input_indexes.copy_from_cpu_async(&lwe_indexes, &streams, 0) };
     let d_encryptions_of_zero = CudaLweCiphertextList::from_lwe_ciphertext_list(
         &encryptions_of_zero,
@@ -186,6 +190,7 @@ fn check_noise_improve_modulus_switch_noise(
         streams.gpu_indexes[0].get(),
         d_ct.0.d_vec.as_mut_c_ptr(0),
         d_ct_in.0.d_vec.as_c_ptr(0),
+        d_input_indexes.as_c_ptr(0),
         d_encryptions_of_zero.0.d_vec.as_c_ptr(0),
         lwe_dimension.to_lwe_size().0 as u32,
         d_ct.lwe_ciphertext_count().0 as u32,

View File

@@ -606,40 +606,6 @@ pub unsafe fn cuda_modulus_switch_ciphertext_async<T: UnsignedInteger>(
     );
 }
-/// # Safety
-///
-/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
-/// required
-#[allow(clippy::too_many_arguments)]
-pub unsafe fn cuda_improve_noise_modulus_switch_ciphertext_async<T: UnsignedInteger>(
-    streams: &CudaStreams,
-    lwe_array_out: &mut CudaVec<T>,
-    lwe_array_in: &CudaVec<T>,
-    encrypted_zeros: &CudaVec<T>,
-    lwe_dimension: LweDimension,
-    num_samples: u32,
-    num_zeros: u32,
-    input_variance: f64,
-    r_sigma_factor: f64,
-    bound: f64,
-    log_modulus: u32,
-) {
-    cuda_improve_noise_modulus_switch_64(
-        streams.ptr[0],
-        streams.gpu_indexes[0].get(),
-        lwe_array_out.as_mut_c_ptr(0),
-        lwe_array_in.as_c_ptr(0),
-        encrypted_zeros.as_c_ptr(0),
-        lwe_dimension.to_lwe_size().0 as u32,
-        num_samples,
-        num_zeros,
-        input_variance,
-        r_sigma_factor,
-        bound,
-        log_modulus,
-    );
-}
 /// Addition of a vector of LWE ciphertexts
 ///
 /// # Safety