Compare commits

...

2 Commits

Author SHA1 Message Date
Agnes Leroy
7dcbd85a83 chore(gpu): stop using optional arguments altogether 2024-09-02 15:51:45 +02:00
Agnes Leroy
1e453263af chore(gpu): remove device synchronization in drop for CudaVec 2024-09-02 15:09:08 +02:00
13 changed files with 79 additions and 50 deletions

View File

@@ -27,7 +27,7 @@ private:
public:
__device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
uint32_t num_poly = 1)
uint32_t num_poly)
: base_log(base_log), level_count(level_count), num_poly(num_poly),
state(state) {

View File

@@ -271,7 +271,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
if (!ch_amount)
ch_amount++;
dim3 add_grid(ch_amount, num_blocks, 1);
size_t sm_size = big_lwe_size * sizeof(Torus);
cudaSetDevice(gpu_indexes[0]);
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(

View File

@@ -207,9 +207,9 @@ __global__ void device_programmable_bootstrap_amortized(
// the resulting constant coefficient of the accumulator
// For the mask it's more complicated
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
glwe_dimension, 0);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
glwe_dimension, 0);
}
template <typename Torus>

View File

@@ -98,8 +98,8 @@ __global__ void device_programmable_bootstrap_cg(
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat, false,
1);
for (int i = 0; i < lwe_dimension; i++) {
synchronize_threads_in_block();
@@ -111,13 +111,13 @@ __global__ void device_programmable_bootstrap_cg(
// Perform ACC * (X^ä - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
accumulator, accumulator_rotated, a_hat, 1);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count);
accumulator_rotated, base_log, level_count, 1);
synchronize_threads_in_block();
@@ -125,7 +125,7 @@ __global__ void device_programmable_bootstrap_cg(
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
accumulator_rotated, 1);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
@@ -150,9 +150,9 @@ __global__ void device_programmable_bootstrap_cg(
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1, 0);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0, 0);
}
}

View File

@@ -86,7 +86,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
false, 1);
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
@@ -98,12 +98,13 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
accumulator, base_log, level_count, 1);
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator,
1);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
@@ -129,9 +130,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
0);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
0);
}
} else {
// Load the accumulator calculated in previous iterations

View File

@@ -82,7 +82,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
false, 1);
// Persist
int tid = threadIdx.x;
@@ -102,20 +102,20 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// Perform ACC * (X^ä - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(global_slice,
accumulator, a_hat);
accumulator, a_hat, 1);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
accumulator, base_log, level_count, 1);
synchronize_threads_in_block();
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator, 1);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
@@ -215,9 +215,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
0);
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
0);
}
} else {
// Persist the updated accumulator

View File

@@ -210,7 +210,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
false, 1);
// Persist
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
@@ -225,12 +225,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
accumulator, base_log, level_count, 1);
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator, 1);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
@@ -324,9 +324,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice);
sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice, 1,
0);
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0,
0);
}
}
}

View File

@@ -115,13 +115,13 @@ __global__ void device_programmable_bootstrap_tbc(
// Perform ACC * (X^ä - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
accumulator, accumulator_rotated, a_hat, 1);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count);
accumulator_rotated, base_log, level_count, 1);
synchronize_threads_in_block();
@@ -154,9 +154,9 @@ __global__ void device_programmable_bootstrap_tbc(
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1, 0);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0, 0);
}
}

View File

@@ -94,7 +94,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
false, 1);
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
@@ -106,12 +106,13 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
accumulator, base_log, level_count, 1);
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator,
1);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
@@ -137,9 +138,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
0);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
0);
}
} else {
// Load the accumulator calculated in previous iterations

View File

@@ -45,7 +45,7 @@ template <typename T, int elems_per_thread, int block_size>
__device__ void
divide_by_monomial_negacyclic_inplace(T *accumulator,
const T *__restrict__ input, uint32_t j,
bool zeroAcc, uint32_t num_poly = 1) {
bool zeroAcc, uint32_t num_poly) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
@@ -94,7 +94,7 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
T *acc, T *result_acc, uint32_t j, uint32_t num_poly = 1) {
T *acc, T *result_acc, uint32_t j, uint32_t num_poly) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
@@ -133,7 +133,7 @@ __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
template <typename T, int elems_per_thread, int block_size>
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
int level_count,
uint32_t num_poly = 1) {
uint32_t num_poly) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
@@ -192,7 +192,7 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
// Extracts the body of the nth-LWE in a GLWE.
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
uint32_t glwe_dimension, uint32_t nth = 0) {
uint32_t glwe_dimension, uint32_t nth) {
// Set first coefficient of the glwe as the body of the LWE sample
lwe_array_out[glwe_dimension * params::degree] =
glwe[glwe_dimension * params::degree + nth];
@@ -201,8 +201,7 @@ __device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
// Extracts the mask from the nth-LWE in a GLWE.
template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
uint32_t glwe_dimension = 1,
uint32_t nth = 0) {
uint32_t glwe_dimension, uint32_t nth) {
for (int z = 0; z < glwe_dimension; z++) {
Torus *lwe_array_out_slice =
(Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);

View File

@@ -5,16 +5,16 @@ use crate::core_crypto::gpu::{extract_lwe_samples_from_glwe_ciphertext_list_asyn
use crate::core_crypto::prelude::{MonomialDegree, UnsignedTorus};
use itertools::Itertools;
/// For each [`GLWE Ciphertext`] (`CudaGlweCiphertextList`) given as input, extract the nth
/// coefficient from its body as an [`LWE ciphertext`](`CudaLweCiphertextList`). This variant is
/// GPU-accelerated.
pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
/// # Safety
///
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until stream is synchronised
pub unsafe fn cuda_extract_lwe_samples_from_glwe_ciphertext_list_async<Scalar>(
input_glwe_list: &CudaGlweCiphertextList<Scalar>,
output_lwe_list: &mut CudaLweCiphertextList<Scalar>,
vec_nth: &[MonomialDegree],
streams: &CudaStreams,
) where
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus,
{
let in_lwe_dim = input_glwe_list
@@ -58,3 +58,25 @@ pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
);
}
}
/// For each [`GLWE Ciphertext`] (`CudaGlweCiphertextList`) given as input, extract the nth
/// coefficient from its body as an [`LWE ciphertext`](`CudaLweCiphertextList`). This variant is
/// GPU-accelerated.
pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
input_glwe_list: &CudaGlweCiphertextList<Scalar>,
output_lwe_list: &mut CudaLweCiphertextList<Scalar>,
vec_nth: &[MonomialDegree],
streams: &CudaStreams,
) where
Scalar: UnsignedTorus,
{
unsafe {
cuda_extract_lwe_samples_from_glwe_ciphertext_list_async(
input_glwe_list,
output_lwe_list,
vec_nth,
streams,
);
}
streams.synchronize();
}

View File

@@ -1,5 +1,5 @@
use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut};
use crate::core_crypto::gpu::{synchronize_device, CudaStreams};
use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::Numeric;
use std::collections::Bound::{Excluded, Included, Unbounded};
use std::ffi::c_void;
@@ -447,8 +447,6 @@ impl<T: Numeric> Drop for CudaVec<T> {
/// Free memory for pointer `ptr` synchronously
fn drop(&mut self) {
for &gpu_index in self.gpu_indexes.iter() {
// Synchronizes the device to be sure no stream is still using this pointer
synchronize_device(gpu_index);
unsafe { cuda_drop(self.get_mut_c_ptr(gpu_index), gpu_index) };
}
}

View File

@@ -370,6 +370,7 @@ impl CudaServerKey {
let mut result = unsafe { ciphertexts[0].duplicate_async(streams) };
if ciphertexts.len() == 1 {
streams.synchronize();
return Some(result);
}