mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-11 15:48:20 -05:00
Compare commits
2 Commits
mz/ct_mod_
...
al/fixes
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7dcbd85a83 | ||
|
|
1e453263af |
@@ -27,7 +27,7 @@ private:
|
||||
|
||||
public:
|
||||
__device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
|
||||
uint32_t num_poly = 1)
|
||||
uint32_t num_poly)
|
||||
: base_log(base_log), level_count(level_count), num_poly(num_poly),
|
||||
state(state) {
|
||||
|
||||
|
||||
@@ -271,7 +271,6 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
if (!ch_amount)
|
||||
ch_amount++;
|
||||
dim3 add_grid(ch_amount, num_blocks, 1);
|
||||
size_t sm_size = big_lwe_size * sizeof(Torus);
|
||||
|
||||
cudaSetDevice(gpu_indexes[0]);
|
||||
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
|
||||
|
||||
@@ -207,9 +207,9 @@ __global__ void device_programmable_bootstrap_amortized(
|
||||
// the resulting constant coefficient of the accumulator
|
||||
// For the mask it's more complicated
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
|
||||
glwe_dimension);
|
||||
glwe_dimension, 0);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
|
||||
glwe_dimension);
|
||||
glwe_dimension, 0);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
|
||||
@@ -98,8 +98,8 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat, false,
|
||||
1);
|
||||
|
||||
for (int i = 0; i < lwe_dimension; i++) {
|
||||
synchronize_threads_in_block();
|
||||
@@ -111,13 +111,13 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, accumulator_rotated, a_hat);
|
||||
accumulator, accumulator_rotated, a_hat, 1);
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_rotated, base_log, level_count);
|
||||
accumulator_rotated, base_log, level_count, 1);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
@@ -125,7 +125,7 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
|
||||
accumulator_rotated);
|
||||
accumulator_rotated, 1);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
@@ -150,9 +150,9 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
// Perform a sample extract. At this point, all blocks have the result, but
|
||||
// we do the computation at block 0 to avoid waiting for extra blocks, in
|
||||
// case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1, 0);
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0, 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -86,7 +86,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
false, 1);
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
@@ -98,12 +98,13 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
accumulator, base_log, level_count, 1);
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator,
|
||||
1);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
@@ -129,9 +130,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
|
||||
0);
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
|
||||
0);
|
||||
}
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
|
||||
@@ -82,7 +82,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
false, 1);
|
||||
|
||||
// Persist
|
||||
int tid = threadIdx.x;
|
||||
@@ -102,20 +102,20 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(global_slice,
|
||||
accumulator, a_hat);
|
||||
accumulator, a_hat, 1);
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
accumulator, base_log, level_count, 1);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator, 1);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
@@ -215,9 +215,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
|
||||
0);
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
|
||||
0);
|
||||
}
|
||||
} else {
|
||||
// Persist the updated accumulator
|
||||
|
||||
@@ -210,7 +210,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
false, 1);
|
||||
|
||||
// Persist
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
@@ -225,12 +225,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
accumulator, base_log, level_count, 1);
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator, 1);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
@@ -324,9 +324,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice);
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice, 1,
|
||||
0);
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0,
|
||||
0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -115,13 +115,13 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, accumulator_rotated, a_hat);
|
||||
accumulator, accumulator_rotated, a_hat, 1);
|
||||
|
||||
// Perform a rounding to increase the accuracy of the
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator_rotated, base_log, level_count);
|
||||
accumulator_rotated, base_log, level_count, 1);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
@@ -154,9 +154,9 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
// Perform a sample extract. At this point, all blocks have the result, but
|
||||
// we do the computation at block 0 to avoid waiting for extra blocks, in
|
||||
// case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1, 0);
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0, 0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -94,7 +94,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
false, 1);
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
@@ -106,12 +106,13 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
accumulator, base_log, level_count, 1);
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator,
|
||||
1);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
@@ -137,9 +138,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1,
|
||||
0);
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0,
|
||||
0);
|
||||
}
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
|
||||
@@ -45,7 +45,7 @@ template <typename T, int elems_per_thread, int block_size>
|
||||
__device__ void
|
||||
divide_by_monomial_negacyclic_inplace(T *accumulator,
|
||||
const T *__restrict__ input, uint32_t j,
|
||||
bool zeroAcc, uint32_t num_poly = 1) {
|
||||
bool zeroAcc, uint32_t num_poly) {
|
||||
constexpr int degree = block_size * elems_per_thread;
|
||||
for (int z = 0; z < num_poly; z++) {
|
||||
T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
|
||||
@@ -94,7 +94,7 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
|
||||
*/
|
||||
template <typename T, int elems_per_thread, int block_size>
|
||||
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
|
||||
T *acc, T *result_acc, uint32_t j, uint32_t num_poly = 1) {
|
||||
T *acc, T *result_acc, uint32_t j, uint32_t num_poly) {
|
||||
constexpr int degree = block_size * elems_per_thread;
|
||||
for (int z = 0; z < num_poly; z++) {
|
||||
T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
|
||||
@@ -133,7 +133,7 @@ __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
|
||||
template <typename T, int elems_per_thread, int block_size>
|
||||
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
|
||||
int level_count,
|
||||
uint32_t num_poly = 1) {
|
||||
uint32_t num_poly) {
|
||||
constexpr int degree = block_size * elems_per_thread;
|
||||
for (int z = 0; z < num_poly; z++) {
|
||||
T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
|
||||
@@ -192,7 +192,7 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
|
||||
// Extracts the body of the nth-LWE in a GLWE.
|
||||
template <typename Torus, class params>
|
||||
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
|
||||
uint32_t glwe_dimension, uint32_t nth = 0) {
|
||||
uint32_t glwe_dimension, uint32_t nth) {
|
||||
// Set first coefficient of the glwe as the body of the LWE sample
|
||||
lwe_array_out[glwe_dimension * params::degree] =
|
||||
glwe[glwe_dimension * params::degree + nth];
|
||||
@@ -201,8 +201,7 @@ __device__ void sample_extract_body(Torus *lwe_array_out, Torus *glwe,
|
||||
// Extracts the mask from the nth-LWE in a GLWE.
|
||||
template <typename Torus, class params>
|
||||
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *glwe,
|
||||
uint32_t glwe_dimension = 1,
|
||||
uint32_t nth = 0) {
|
||||
uint32_t glwe_dimension, uint32_t nth) {
|
||||
for (int z = 0; z < glwe_dimension; z++) {
|
||||
Torus *lwe_array_out_slice =
|
||||
(Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
|
||||
|
||||
@@ -5,16 +5,16 @@ use crate::core_crypto::gpu::{extract_lwe_samples_from_glwe_ciphertext_list_asyn
|
||||
use crate::core_crypto::prelude::{MonomialDegree, UnsignedTorus};
|
||||
use itertools::Itertools;
|
||||
|
||||
/// For each [`GLWE Ciphertext`] (`CudaGlweCiphertextList`) given as input, extract the nth
|
||||
/// coefficient from its body as an [`LWE ciphertext`](`CudaLweCiphertextList`). This variant is
|
||||
/// GPU-accelerated.
|
||||
pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
|
||||
/// # Safety
|
||||
///
|
||||
/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not
|
||||
/// be dropped until stream is synchronised
|
||||
pub unsafe fn cuda_extract_lwe_samples_from_glwe_ciphertext_list_async<Scalar>(
|
||||
input_glwe_list: &CudaGlweCiphertextList<Scalar>,
|
||||
output_lwe_list: &mut CudaLweCiphertextList<Scalar>,
|
||||
vec_nth: &[MonomialDegree],
|
||||
streams: &CudaStreams,
|
||||
) where
|
||||
// CastInto required for PBS modulus switch which returns a usize
|
||||
Scalar: UnsignedTorus,
|
||||
{
|
||||
let in_lwe_dim = input_glwe_list
|
||||
@@ -58,3 +58,25 @@ pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// For each [`GLWE Ciphertext`] (`CudaGlweCiphertextList`) given as input, extract the nth
|
||||
/// coefficient from its body as an [`LWE ciphertext`](`CudaLweCiphertextList`). This variant is
|
||||
/// GPU-accelerated.
|
||||
pub fn cuda_extract_lwe_samples_from_glwe_ciphertext_list<Scalar>(
|
||||
input_glwe_list: &CudaGlweCiphertextList<Scalar>,
|
||||
output_lwe_list: &mut CudaLweCiphertextList<Scalar>,
|
||||
vec_nth: &[MonomialDegree],
|
||||
streams: &CudaStreams,
|
||||
) where
|
||||
Scalar: UnsignedTorus,
|
||||
{
|
||||
unsafe {
|
||||
cuda_extract_lwe_samples_from_glwe_ciphertext_list_async(
|
||||
input_glwe_list,
|
||||
output_lwe_list,
|
||||
vec_nth,
|
||||
streams,
|
||||
);
|
||||
}
|
||||
streams.synchronize();
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut};
|
||||
use crate::core_crypto::gpu::{synchronize_device, CudaStreams};
|
||||
use crate::core_crypto::gpu::CudaStreams;
|
||||
use crate::core_crypto::prelude::Numeric;
|
||||
use std::collections::Bound::{Excluded, Included, Unbounded};
|
||||
use std::ffi::c_void;
|
||||
@@ -447,8 +447,6 @@ impl<T: Numeric> Drop for CudaVec<T> {
|
||||
/// Free memory for pointer `ptr` synchronously
|
||||
fn drop(&mut self) {
|
||||
for &gpu_index in self.gpu_indexes.iter() {
|
||||
// Synchronizes the device to be sure no stream is still using this pointer
|
||||
synchronize_device(gpu_index);
|
||||
unsafe { cuda_drop(self.get_mut_c_ptr(gpu_index), gpu_index) };
|
||||
}
|
||||
}
|
||||
|
||||
@@ -370,6 +370,7 @@ impl CudaServerKey {
|
||||
let mut result = unsafe { ciphertexts[0].duplicate_async(streams) };
|
||||
|
||||
if ciphertexts.len() == 1 {
|
||||
streams.synchronize();
|
||||
return Some(result);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user