mirror of https://github.com/zama-ai/concrete.git
refactor(cuda): Refactor the amortized PBS.
@@ -13,8 +13,6 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
  assert(
      ("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32));
  assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1",
          glwe_dimension == 1));
  assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
          "1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
@@ -26,36 +24,41 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
    host_bootstrap_amortized<uint32_t, Degree<512>>(
        v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 1024:
    host_bootstrap_amortized<uint32_t, Degree<1024>>(
        v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 2048:
    host_bootstrap_amortized<uint32_t, Degree<2048>>(
        v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 4096:
    host_bootstrap_amortized<uint32_t, Degree<4096>>(
        v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 8192:
    host_bootstrap_amortized<uint32_t, Degree<8192>>(
        v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  default:
    break;

@@ -136,8 +139,6 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
  assert(
      ("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64));
  assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1",
          glwe_dimension == 1));
  assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
          "1024, 2048, 4096, 8192",
          polynomial_size == 512 || polynomial_size == 1024 ||
@@ -149,36 +150,41 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
    host_bootstrap_amortized<uint64_t, Degree<512>>(
        v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 1024:
    host_bootstrap_amortized<uint64_t, Degree<1024>>(
        v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 2048:
    host_bootstrap_amortized<uint64_t, Degree<2048>>(
        v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 4096:
    host_bootstrap_amortized<uint64_t, Degree<4096>>(
        v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  case 8192:
    host_bootstrap_amortized<uint64_t, Degree<8192>>(
        v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
        level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
        lwe_idx, max_shared_memory);
    break;
  default:
    break;
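
The switch above maps the runtime polynomial_size onto the compile-time Degree<N> template parameter, so each kernel instantiation knows its polynomial degree (and hence its loop bounds) at compile time. A minimal host-side sketch of that dispatch pattern, with Degree and run_pbs as illustrative placeholders rather than the library's real entry points:

// Sketch of the runtime-to-compile-time dispatch used by the wrapper:
// the switch on polynomial_size selects a template instantiation so the
// degree is a compile-time constant inside the kernel.
#include <cstdint>
#include <cstdio>

template <uint32_t N> struct Degree { static constexpr uint32_t value = N; };

template <typename params> void run_pbs(uint32_t glwe_dimension) {
  std::printf("PBS with degree %u, glwe_dimension %u\n", params::value,
              glwe_dimension);
}

void dispatch(uint32_t polynomial_size, uint32_t glwe_dimension) {
  switch (polynomial_size) {
  case 512:  run_pbs<Degree<512>>(glwe_dimension);  break;
  case 1024: run_pbs<Degree<1024>>(glwe_dimension); break;
  case 2048: run_pbs<Degree<2048>>(glwe_dimension); break;
  case 4096: run_pbs<Degree<4096>>(glwe_dimension); break;
  case 8192: run_pbs<Degree<8192>>(glwe_dimension); break;
  default: break;
  }
}

int main() { dispatch(1024, 1); }
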
@@ -53,8 +53,8 @@ template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_amortized(
    Torus *lwe_array_out, Torus *lut_vector, uint32_t *lut_vector_indexes,
    Torus *lwe_array_in, double2 *bootstrapping_key, char *device_mem,
    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t lwe_idx,
    uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t lwe_idx,
    size_t device_memory_size_per_sample) {
  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -69,26 +69,22 @@ __global__ void device_bootstrap_amortized(

  // For GPU bootstrapping the GLWE dimension is hard-set to 1: there is only
  // one mask polynomial and 1 body to handle.
  Torus *accumulator_mask = (Torus *)selected_memory;
  Torus *accumulator_body =
      (Torus *)accumulator_mask + (ptrdiff_t)polynomial_size;
  Torus *accumulator_mask_rotated =
      (Torus *)accumulator_body + (ptrdiff_t)polynomial_size;
  Torus *accumulator_body_rotated =
      (Torus *)accumulator_mask_rotated + (ptrdiff_t)polynomial_size;
  double2 *mask_res_fft = (double2 *)accumulator_body_rotated +
                          polynomial_size / (sizeof(double2) / sizeof(Torus));
  double2 *body_res_fft =
      (double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2;
  Torus *accumulator = (Torus *)selected_memory;
  Torus *accumulator_rotated =
      (Torus *)accumulator +
      (ptrdiff_t)((glwe_dimension + 1) * polynomial_size);
  double2 *res_fft =
      (double2 *)accumulator_rotated + (glwe_dimension + 1) * polynomial_size /
                                           (sizeof(double2) / sizeof(Torus));
  double2 *accumulator_fft = (double2 *)sharedmem;
  if constexpr (SMD != PARTIALSM)
    accumulator_fft =
        (double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);
    accumulator_fft = (double2 *)res_fft +
                      (ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2);

  auto block_lwe_array_in = &lwe_array_in[blockIdx.x * (lwe_dimension + 1)];
  Torus *block_lut_vector =
      &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
                  2];
                  (glwe_dimension + 1)];

  // Put "b", the body, in [0, 2N[
  Torus b_hat = 0;
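
The refactor replaces the separate mask/body scratch buffers with one contiguous block of (glwe_dimension + 1) polynomials per sample: accumulator, accumulator_rotated, res_fft and accumulator_fft are laid out back to back. A rough byte-offset sketch of that layout, assuming Torus = uint64_t, N = 1024 and glwe_dimension = 1 (the kernel derives the same offsets with pointer arithmetic on Torus* and double2*):

// Per-sample scratch layout after the refactor, in bytes.
#include <cstdint>
#include <cstdio>

int main() {
  using Torus = uint64_t;
  const uint32_t N = 1024;       // polynomial_size
  const uint32_t k = 1;          // glwe_dimension
  const uint32_t polys = k + 1;  // mask polynomials + body

  size_t accumulator = 0;                                    // polys * N Torus
  size_t accumulator_rotated = accumulator + sizeof(Torus) * polys * N;
  size_t res_fft = accumulator_rotated + sizeof(Torus) * polys * N;
  size_t accumulator_fft = res_fft + sizeof(double) * 2 * polys * N / 2;
  size_t total = accumulator_fft + sizeof(double) * 2 * polys * N / 2;

  std::printf("accumulator         @ %zu\n", accumulator);
  std::printf("accumulator_rotated @ %zu\n", accumulator_rotated);
  std::printf("res_fft             @ %zu\n", res_fft);
  std::printf("accumulator_fft     @ %zu\n", accumulator_fft);
  std::printf("bytes per sample    = %zu\n", total);
}
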
@@ -97,11 +93,7 @@ __global__ void device_bootstrap_amortized(

  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                        params::degree / params::opt>(
      accumulator_mask, block_lut_vector, b_hat, false);

  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                        params::degree / params::opt>(
      accumulator_body, &block_lut_vector[params::degree], b_hat, false);
      accumulator, block_lut_vector, b_hat, false, glwe_dimension + 1);

  // Loop over all the mask elements of the sample to accumulate
  // (X^a_i-1) multiplication, decomposition of the resulting polynomial
@@ -118,11 +110,7 @@ __global__ void device_bootstrap_amortized(
    // Perform ACC * (X^ä - 1)
    multiply_by_monomial_negacyclic_and_sub_polynomial<
        Torus, params::opt, params::degree / params::opt>(
        accumulator_mask, accumulator_mask_rotated, a_hat);

    multiply_by_monomial_negacyclic_and_sub_polynomial<
        Torus, params::opt, params::degree / params::opt>(
        accumulator_body, accumulator_body_rotated, a_hat);
        accumulator, accumulator_rotated, a_hat, glwe_dimension + 1);

    synchronize_threads_in_block();
@@ -130,153 +118,122 @@ __global__ void device_bootstrap_amortized(
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
        accumulator_mask_rotated, base_log, level_count);
        accumulator_rotated, base_log, level_count, glwe_dimension + 1);

    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
        accumulator_body_rotated, base_log, level_count);
    // Initialize the polynomial multiplication via FFT arrays
    // The polynomial multiplications happens at the block level
    // and each thread handles two or more coefficients
    int pos = threadIdx.x;
    for (int j = 0; j < params::opt / 2; j++) {
      mask_res_fft[pos].x = 0;
      mask_res_fft[pos].y = 0;
      body_res_fft[pos].x = 0;
      body_res_fft[pos].y = 0;
    for (int j = 0; j < (glwe_dimension + 1) * params::opt / 2; j++) {
      res_fft[pos].x = 0;
      res_fft[pos].y = 0;
      pos += params::degree / params::opt;
    }

    GadgetMatrix<Torus, params> gadget_mask(base_log, level_count,
                                            accumulator_mask_rotated);
    GadgetMatrix<Torus, params> gadget_body(base_log, level_count,
                                            accumulator_body_rotated);

    GadgetMatrix<Torus, params> gadget(base_log, level_count,
                                       accumulator_rotated, glwe_dimension + 1);
    // Now that the rotation is done, decompose the resulting polynomial
    // coefficients so as to multiply each decomposed level with the
    // corresponding part of the bootstrapping key
    for (int level = level_count - 1; level >= 0; level--) {
      gadget.decompose_and_compress_next(accumulator_fft);
      for (int i = 0; i < (glwe_dimension + 1); i++) {
        auto accumulator_fft_slice = accumulator_fft + i * params::degree / 2;

      gadget_mask.decompose_and_compress_next(accumulator_fft);
      // Switch to the FFT space
      NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
        // Switch to the FFT space
        NSMFFT_direct<HalfDegree<params>>(accumulator_fft_slice);
        synchronize_threads_in_block();

        // Get the bootstrapping key piece necessary for the multiplication
        // It is already in the Fourier domain
        auto bsk_slice = get_ith_mask_kth_block(bootstrapping_key, iteration, i,
                                                level, polynomial_size,
                                                glwe_dimension, level_count);
        synchronize_threads_in_block();

        // Perform the coefficient-wise product with the two pieces of
        // bootstrapping key
        for (int j = 0; j < (glwe_dimension + 1); j++) {
          auto bsk_poly = bsk_slice + j * params::degree / 2;
          auto res_fft_poly = res_fft + j * params::degree / 2;
          polynomial_product_accumulate_in_fourier_domain<params, double2>(
              res_fft_poly, accumulator_fft_slice, bsk_poly);
        }
      }
      synchronize_threads_in_block();

      // Get the bootstrapping key piece necessary for the multiplication
      // It is already in the Fourier domain
      auto bsk_mask_slice =
          get_ith_mask_kth_block(bootstrapping_key, iteration, 0, level,
                                 polynomial_size, 1, level_count);
      auto bsk_body_slice =
          get_ith_body_kth_block(bootstrapping_key, iteration, 0, level,
                                 polynomial_size, 1, level_count);

      synchronize_threads_in_block();

      // Perform the coefficient-wise product with the two pieces of
      // bootstrapping key
      polynomial_product_accumulate_in_fourier_domain<params, double2>(
          mask_res_fft, accumulator_fft, bsk_mask_slice);
      polynomial_product_accumulate_in_fourier_domain<params, double2>(
          body_res_fft, accumulator_fft, bsk_body_slice);

      synchronize_threads_in_block();
      gadget_body.decompose_and_compress_next(accumulator_fft);

      // Now handle the polynomial multiplication for the body
      // in the same way

      NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
      synchronize_threads_in_block();

      auto bsk_mask_slice_2 =
          get_ith_mask_kth_block(bootstrapping_key, iteration, 1, level,
                                 polynomial_size, 1, level_count);
      auto bsk_body_slice_2 =
          get_ith_body_kth_block(bootstrapping_key, iteration, 1, level,
                                 polynomial_size, 1, level_count);

      synchronize_threads_in_block();

      polynomial_product_accumulate_in_fourier_domain<params, double2>(
          mask_res_fft, accumulator_fft, bsk_mask_slice_2);
      polynomial_product_accumulate_in_fourier_domain<params, double2>(
          body_res_fft, accumulator_fft, bsk_body_slice_2);
    }

    // Come back to the coefficient representation
    if constexpr (SMD == FULLSM || SMD == NOSM) {
      synchronize_threads_in_block();

      NSMFFT_inverse<HalfDegree<params>>(mask_res_fft);
      NSMFFT_inverse<HalfDegree<params>>(body_res_fft);

      for (int i = 0; i < (glwe_dimension + 1); i++) {
        auto res_fft_slice = res_fft + i * params::degree / 2;
        NSMFFT_inverse<HalfDegree<params>>(res_fft_slice);
      }
      synchronize_threads_in_block();

      add_to_torus<Torus, params>(mask_res_fft, accumulator_mask);
      add_to_torus<Torus, params>(body_res_fft, accumulator_body);
      for (int i = 0; i < (glwe_dimension + 1); i++) {
        auto accumulator_slice = accumulator + i * params::degree;
        auto res_fft_slice = res_fft + i * params::degree / 2;
        add_to_torus<Torus, params>(res_fft_slice, accumulator_slice);
      }
      synchronize_threads_in_block();

    } else {
      int tid = threadIdx.x;
#pragma unroll
      for (int i = 0; i < params::opt / 2; i++) {
        accumulator_fft[tid] = mask_res_fft[tid];
        tid = tid + params::degree / params::opt;
      for (int i = 0; i < (glwe_dimension + 1); i++) {
        auto accumulator_slice = accumulator + i * params::degree;
        auto res_fft_slice = res_fft + i * params::degree / 2;
        int tid = threadIdx.x;
        for (int i = 0; i < params::opt / 2; i++) {
          accumulator_fft[tid] = res_fft_slice[tid];
          tid = tid + params::degree / params::opt;
        }
        synchronize_threads_in_block();

        NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
        synchronize_threads_in_block();

        add_to_torus<Torus, params>(accumulator_fft, accumulator_slice);
      }
      synchronize_threads_in_block();

      NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
      synchronize_threads_in_block();

      add_to_torus<Torus, params>(accumulator_fft, accumulator_mask);
      synchronize_threads_in_block();

      tid = threadIdx.x;
#pragma unroll
      for (int i = 0; i < params::opt / 2; i++) {
        accumulator_fft[tid] = body_res_fft[tid];
        tid = tid + params::degree / params::opt;
      }
      synchronize_threads_in_block();

      NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
      synchronize_threads_in_block();

      add_to_torus<Torus, params>(accumulator_fft, accumulator_body);
      synchronize_threads_in_block();
    }
  }

  auto block_lwe_array_out = &lwe_array_out[blockIdx.x * (polynomial_size + 1)];
  auto block_lwe_array_out =
      &lwe_array_out[blockIdx.x * (glwe_dimension * polynomial_size + 1)];

  // The blind rotation for this block is over
  // Now we can perform the sample extraction: for the body it's just
  // the resulting constant coefficient of the accumulator
  // For the mask it's more complicated
  sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator_mask);
  sample_extract_body<Torus, params>(block_lwe_array_out, accumulator_body);
  sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
                                     glwe_dimension);
  sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
                                     glwe_dimension);
}
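
In the refactored kernel, each decomposition level yields (glwe_dimension + 1) polynomial slices; each slice is sent to the Fourier domain and multiplied against the matching block of the bootstrapping key, accumulating into the (glwe_dimension + 1) output polynomials of res_fft. A loose host-side sketch of that loop nesting only, with the FFTs stubbed out and dummy Fourier-domain data standing in for the decomposed accumulator and the key (the real code re-decomposes the accumulator at every level and indexes the key by iteration and level):

// Structure-only sketch of the Fourier-domain multiply-accumulate over
// (glwe_dimension + 1) polynomials per decomposition level.
#include <complex>
#include <cstdint>
#include <vector>

using cplx = std::complex<double>;

int main() {
  const uint32_t N = 8, k = 1, level_count = 2;
  const uint32_t polys = k + 1;

  // res_fft: (k + 1) output polynomials of N/2 complex coefficients.
  std::vector<std::vector<cplx>> res_fft(polys, std::vector<cplx>(N / 2));
  // Decomposed accumulator slices and key blocks, already "in Fourier form"
  // (dummy values; the kernel produces these with NSMFFT_direct).
  std::vector<std::vector<cplx>> acc_fft(polys,
                                         std::vector<cplx>(N / 2, {1, 0}));
  std::vector<std::vector<std::vector<cplx>>> bsk(
      polys,
      std::vector<std::vector<cplx>>(polys, std::vector<cplx>(N / 2, {0, 1})));

  for (uint32_t level = 0; level < level_count; level++) {
    for (uint32_t i = 0; i < polys; i++) {     // decomposed GLWE polynomial
      for (uint32_t j = 0; j < polys; j++) {   // key block -> output polynomial
        for (uint32_t c = 0; c < N / 2; c++)   // coefficient-wise product
          res_fft[j][c] += acc_fft[i][c] * bsk[i][j][c];
      }
    }
  }
}
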
template <typename Torus, class params>
__host__ void host_bootstrap_amortized(
    void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
    uint32_t *lut_vector_indexes, Torus *lwe_array_in,
    double2 *bootstrapping_key, uint32_t input_lwe_dimension,
    double2 *bootstrapping_key, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
    uint32_t lwe_idx, uint32_t max_shared_memory) {

  cudaSetDevice(gpu_index);
  int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator mask
                sizeof(Torus) * polynomial_size + // accumulator body
                sizeof(Torus) * polynomial_size + // accumulator mask rotated
                sizeof(Torus) * polynomial_size + // accumulator body rotated
                sizeof(double2) * polynomial_size / 2 + // accumulator fft mask
                sizeof(double2) * polynomial_size / 2 + // accumulator fft body
                sizeof(double2) * polynomial_size / 2; // calculate buffer fft
  int SM_FULL =
      sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
      sizeof(Torus) * polynomial_size *
          (glwe_dimension + 1) + // accumulator rotated
      sizeof(double2) * polynomial_size / 2 *
          (glwe_dimension + 1) + // accumulator fft
      sizeof(double2) * polynomial_size / 2 *
          (glwe_dimension + 1); // calculate buffer fft

  int SM_PART = sizeof(double2) * polynomial_size / 2; // calculate buffer fft
  int SM_PART = sizeof(double2) * polynomial_size / 2 *
                (glwe_dimension + 1); // calculate buffer fft

  int DM_PART = SM_FULL - SM_PART;
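
The scratch budget above now scales linearly with glwe_dimension + 1, and the host compares it with the device's limit to pick the NOSM / PARTIALSM / FULLSM variant (the first branch presumably triggers when max_shared_memory is below SM_PART; that condition falls outside the hunk shown). A small sketch of that selection, assuming Torus = uint64_t, N = 1024, glwe_dimension = 1 and a 96 KiB shared-memory budget:

// Selection of the kernel variant from the available shared memory.
#include <cstdint>
#include <cstdio>

enum Variant { NOSM, PARTIALSM, FULLSM };

Variant pick_variant(size_t max_shared_memory, size_t sm_full, size_t sm_part) {
  if (max_shared_memory < sm_part)
    return NOSM;      // all scratch stays in global memory (device_mem)
  if (max_shared_memory < sm_full)
    return PARTIALSM; // only the FFT buffer fits in shared memory
  return FULLSM;      // the whole per-sample scratch fits in shared memory
}

int main() {
  const uint32_t N = 1024, k = 1;
  size_t sm_full = sizeof(uint64_t) * N * (k + 1) * 2          // accumulators
                   + sizeof(double) * 2 * N / 2 * (k + 1) * 2; // FFT buffers
  size_t sm_part = sizeof(double) * 2 * N / 2 * (k + 1);
  std::printf("SM_FULL=%zu SM_PART=%zu -> variant %d on a 96 KiB device\n",
              sm_full, sm_part,
              (int)pick_variant(96 * 1024, sm_full, sm_part));
}
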
@@ -304,8 +261,8 @@ __host__ void host_bootstrap_amortized(
        stream, gpu_index);
    device_bootstrap_amortized<Torus, params, NOSM><<<grid, thds, 0, *stream>>>(
        lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
        bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
        base_log, level_count, lwe_idx, DM_FULL);
        bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, lwe_idx, DM_FULL);
  } else if (max_shared_memory < SM_FULL) {
    cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
                         cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART);
@@ -316,8 +273,8 @@ __host__ void host_bootstrap_amortized(
    device_bootstrap_amortized<Torus, params, PARTIALSM>
        <<<grid, thds, SM_PART, *stream>>>(
            lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
            bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
            base_log, level_count, lwe_idx, DM_PART);
            bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
            polynomial_size, base_log, level_count, lwe_idx, DM_PART);
  } else {
    // For devices with compute capability 7.x a single thread block can
    // address the full capacity of shared memory. Shared memory on the
@@ -335,11 +292,10 @@ __host__ void host_bootstrap_amortized(
    device_bootstrap_amortized<Torus, params, FULLSM>
        <<<grid, thds, SM_FULL, *stream>>>(
            lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
            bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
            base_log, level_count, lwe_idx, 0);
            bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
            polynomial_size, base_log, level_count, lwe_idx, 0);
  }
  check_cuda_error(cudaGetLastError());

  cuda_drop_async(d_mem, stream, gpu_index);
}
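
The PARTIALSM and FULLSM launches rely on cudaFuncSetAttribute to raise the dynamic shared-memory cap before passing SM_PART or SM_FULL as the third launch parameter, which is required once the request exceeds the default 48 KiB. A stand-alone CUDA sketch of that opt-in (demo_kernel is a placeholder, not one of the PBS kernels):

// Opting a kernel in to a large dynamic shared-memory allocation.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void demo_kernel(char *out) {
  extern __shared__ char scratch[];
  if (threadIdx.x == 0) {
    scratch[0] = 1;
    out[0] = scratch[0];
  }
}

int main() {
  const size_t sm_bytes = 64 * 1024; // e.g. SM_FULL for N=1024, k=1, uint64_t
  cudaFuncSetAttribute(demo_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                       (int)sm_bytes);
  char *out;
  cudaMalloc(&out, 1);
  demo_kernel<<<1, 128, sm_bytes>>>(out);
  cudaDeviceSynchronize();
  std::printf("launch status: %s\n", cudaGetErrorString(cudaGetLastError()));
  cudaFree(out);
}
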
@@ -182,11 +182,11 @@ __global__ void device_bootstrap_low_latency(
  if (blockIdx.y == 0) {
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, block_lut_vector, b_hat, false);
        accumulator, block_lut_vector, b_hat, false, 1);
  } else {
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[params::degree], b_hat, false);
        accumulator, &block_lut_vector[params::degree], b_hat, false, 1);
  }

  for (int i = 0; i < lwe_dimension; i++) {
@@ -200,13 +200,13 @@ __global__ void device_bootstrap_low_latency(
    // Perform ACC * (X^ä - 1)
    multiply_by_monomial_negacyclic_and_sub_polynomial<
        Torus, params::opt, params::degree / params::opt>(
        accumulator, accumulator_rotated, a_hat);
        accumulator, accumulator_rotated, a_hat, 1);

    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
        accumulator_rotated, base_log, level_count);
        accumulator_rotated, base_log, level_count, 1);

    synchronize_threads_in_block();

@@ -214,7 +214,7 @@ __global__ void device_bootstrap_low_latency(
    // decomposition, for the mask and the body (so block 0 will have the
    // accumulator decomposed at level 0, 1 at 1, etc.)
    GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
                                           accumulator_rotated);
                                           accumulator_rotated, 1);
    gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);

    // We are using the same memory space for accumulator_fft and
@@ -234,9 +234,9 @@ __global__ void device_bootstrap_low_latency(
    // Perform a sample extract. At this point, all blocks have the result, but
    // we do the computation at block 0 to avoid waiting for extra blocks, in
    // case they're not synchronized
    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
    sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1);
  } else if (blockIdx.x == 0 && blockIdx.y == 1) {
    sample_extract_body<Torus, params>(block_lwe_array_out, accumulator);
    block_lwe_array_out[params::degree] = accumulator[0];
  }
}

@@ -144,8 +144,8 @@ __host__ void host_circuit_bootstrap(
  host_bootstrap_amortized<Torus, params>(
      v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_vector,
      lut_vector_indexes, lwe_array_in_shifted_buffer, fourier_bsk,
      lwe_dimension, polynomial_size, base_log_bsk, level_bsk, pbs_count,
      level_cbs, 0, max_shared_memory);
      glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, level_bsk,
      pbs_count, level_cbs, 0, max_shared_memory);

  dim3 copy_grid(pbs_count * (glwe_dimension + 1), 1, 1);
  dim3 copy_block(params::degree / params::opt, 1, 1);
@@ -15,6 +15,7 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
         level_count;
}

////////////////////////////////////////////////
template <typename T>
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
                                     uint32_t polynomial_size,
@@ -35,8 +36,9 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
              level * polynomial_size / 2 * (glwe_dimension + 1) *
                  (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1) +
              polynomial_size / 2];
              glwe_dimension * polynomial_size / 2];
}
////////////////////////////////////////////////

template <typename T, typename ST>
void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
@@ -4,6 +4,15 @@
#include "polynomial/polynomial.cuh"
#include <cstdint>

/**
 * GadgetMatrix implements the iterator design pattern to decompose a set of
 * num_poly consecutive polynomials with degree params::degree. A total of
 * level_count levels is expected and each call to decompose_and_compress_next()
 * writes to the result the next level. It is also possible to advance an
 * arbitrary amount of levels by using decompose_and_compress_level().
 *
 * This class always decomposes the entire set of num_poly polynomials.
 */
#pragma once
template <typename T, class params> class GadgetMatrix {
private:
@@ -11,19 +20,22 @@ private:
  uint32_t base_log;
  uint32_t mask;
  uint32_t halfbg;
  uint32_t num_poly;
  T offset;
  int current_level;
  T mask_mod_b;
  T *state;

public:
  __device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state)
      : base_log(base_log), level_count(level_count), state(state) {
  __device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
                          uint32_t num_poly)
      : base_log(base_log), level_count(level_count), num_poly(num_poly),
        state(state) {

    mask_mod_b = (1ll << base_log) - 1ll;
    current_level = level_count;
    int tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
    for (int i = 0; i < num_poly * params::opt; i++) {
      state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
      tid += params::degree / params::opt;
    }
@@ -31,26 +43,31 @@ public:
  }

  __device__ void decompose_and_compress_next(double2 *result) {
    int tid = threadIdx.x;
    current_level -= 1;
    for (int i = 0; i < params::opt / 2; i++) {
      T res_re = state[tid] & mask_mod_b;
      T res_im = state[tid + params::degree / 2] & mask_mod_b;
      state[tid] >>= base_log;
      state[tid + params::degree / 2] >>= base_log;
      T carry_re = ((res_re - 1ll) | state[tid]) & res_re;
      T carry_im = ((res_im - 1ll) | state[tid + params::degree / 2]) & res_im;
      carry_re >>= (base_log - 1);
      carry_im >>= (base_log - 1);
      state[tid] += carry_re;
      state[tid + params::degree / 2] += carry_im;
      res_re -= carry_re << base_log;
      res_im -= carry_im << base_log;
    for (int j = 0; j < num_poly; j++) {
      int tid = threadIdx.x;
      auto state_slice = state + j * params::degree;
      auto result_slice = result + j * params::degree / 2;
      for (int i = 0; i < params::opt / 2; i++) {
        T res_re = state_slice[tid] & mask_mod_b;
        T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
        state_slice[tid] >>= base_log;
        state_slice[tid + params::degree / 2] >>= base_log;
        T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
        T carry_im =
            ((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
        carry_re >>= (base_log - 1);
        carry_im >>= (base_log - 1);
        state_slice[tid] += carry_re;
        state_slice[tid + params::degree / 2] += carry_im;
        res_re -= carry_re << base_log;
        res_im -= carry_im << base_log;

        result[tid].x = (int32_t)res_re;
        result[tid].y = (int32_t)res_im;
        result_slice[tid].x = (int32_t)res_re;
        result_slice[tid].y = (int32_t)res_im;

        tid += params::degree / params::opt;
        tid += params::degree / params::opt;
      }
    }
    synchronize_threads_in_block();
  }
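
The carry handling in decompose_and_compress_next implements a balanced (signed) base-2^base_log decomposition of the rounded torus value, one level per call. A scalar, host-side reference of the same bit manipulation for a single uint64_t coefficient, including a check that the weighted sum of the emitted digits recovers the rounded input:

// Scalar reference of the balanced digit decomposition used by GadgetMatrix.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const uint32_t base_log = 4, level_count = 3;
  uint64_t x = 0xABC0000000000000ull; // already rounded to a multiple of
                                      // 2^(64 - base_log * level_count)
  // What the constructor does: keep only the bits that will be decomposed.
  uint64_t state = x >> (64 - base_log * level_count);

  uint64_t mask_mod_b = (1ull << base_log) - 1;
  std::vector<int64_t> digits; // emitted from the lowest level upwards
  for (uint32_t l = 0; l < level_count; l++) {
    uint64_t res = state & mask_mod_b;
    state >>= base_log;
    uint64_t carry = ((res - 1) | state) & res; // 1 when res must be recentred
    carry >>= (base_log - 1);
    state += carry;                             // propagate into the next digit
    digits.push_back((int64_t)(res - (carry << base_log)));
  }

  // Recompose: the first emitted digit has the smallest weight.
  uint64_t recomposed = 0;
  for (uint32_t l = 0; l < level_count; l++)
    recomposed += (uint64_t)digits[l] << (64 - base_log * (level_count - l));
  std::printf("input %016llx recomposed %016llx\n", (unsigned long long)x,
              (unsigned long long)recomposed);
}
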
@@ -56,36 +56,78 @@ __device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
}

/*
 * Receives num_poly concatenated polynomials of type T. For each:
 *
 * Performs acc = acc * (X^ä + 1) if zeroAcc = false
 * Performs acc = 0 if zeroAcc
 * takes single buffer and calculates inplace.
 */
template <typename T, int elems_per_thread, int block_size>
__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
                                                       uint32_t j,
                                                       bool zeroAcc) {
  int tid = threadIdx.x;
                                                       uint32_t j, bool zeroAcc,
                                                       uint32_t num_poly) {
  constexpr int degree = block_size * elems_per_thread;
  if (zeroAcc) {
    for (int i = 0; i < elems_per_thread; i++) {
      accumulator[tid] = 0;
      tid += block_size;
  for (int z = 0; z < num_poly; z++) {
    T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
    T *input_slice = (T *)input + (ptrdiff_t)(z * degree);

    int tid = threadIdx.x;
    if (zeroAcc) {
      for (int i = 0; i < elems_per_thread; i++) {
        accumulator_slice[tid] = 0;
        tid += block_size;
      }
    } else {
      tid = threadIdx.x;
      for (int i = 0; i < elems_per_thread; i++) {
        if (j < degree) {
          if (tid < degree - j) {
            accumulator_slice[tid] = input_slice[tid + j];
          } else {
            accumulator_slice[tid] = -input_slice[tid - degree + j];
          }
        } else {
          uint32_t jj = j - degree;
          if (tid < degree - jj) {
            accumulator_slice[tid] = -input_slice[tid + jj];
          } else {
            accumulator_slice[tid] = input_slice[tid - degree + jj];
          }
        }
        tid += block_size;
      }
    }
  } else {
    tid = threadIdx.x;
  }
}

/*
 * Receives num_poly concatenated polynomials of type T. For each:
 *
 * Performs result_acc = acc * (X^ä - 1) - acc
 * takes single buffer as input and returns a single rotated buffer
 */
template <typename T, int elems_per_thread, int block_size>
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
    T *acc, T *result_acc, uint32_t j, uint32_t num_poly) {
  constexpr int degree = block_size * elems_per_thread;
  for (int z = 0; z < num_poly; z++) {
    T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
    T *result_acc_slice = (T *)result_acc + (ptrdiff_t)(z * degree);
    int tid = threadIdx.x;
    for (int i = 0; i < elems_per_thread; i++) {
      if (j < degree) {
        if (tid < degree - j) {
          accumulator[tid] = input[tid + j];
        if (tid < j) {
          result_acc_slice[tid] = -acc_slice[tid - j + degree] - acc_slice[tid];
        } else {
          accumulator[tid] = -input[tid - degree + j];
          result_acc_slice[tid] = acc_slice[tid - j] - acc_slice[tid];
        }
      } else {
        uint32_t jj = j - degree;
        if (tid < degree - jj) {
          accumulator[tid] = -input[tid + jj];
        if (tid < jj) {
          result_acc_slice[tid] = acc_slice[tid - jj + degree] - acc_slice[tid];

        } else {
          accumulator[tid] = input[tid - degree + jj];
          result_acc_slice[tid] = -acc_slice[tid - jj] - acc_slice[tid];
        }
      }
      tid += block_size;
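
multiply_by_monomial_negacyclic_and_sub_polynomial computes result = acc * X^j - acc in Z[X]/(X^N + 1), now applied once per polynomial slice. A plain C++ reference of the single-polynomial arithmetic, with the wrap-around indices reordered so unsigned arithmetic never underflows:

// Single-polynomial reference for the negacyclic rotate-and-subtract.
#include <cassert>
#include <cstdint>
#include <vector>

template <typename T>
std::vector<T> multiply_by_monomial_and_sub(const std::vector<T> &acc,
                                            uint32_t j) {
  const uint32_t degree = (uint32_t)acc.size();
  std::vector<T> result(degree);
  for (uint32_t tid = 0; tid < degree; tid++) {
    if (j < degree) {
      // X^j with j < N: wrapped coefficients pick up a sign flip.
      if (tid < j)
        result[tid] = -acc[tid + degree - j] - acc[tid];
      else
        result[tid] = acc[tid - j] - acc[tid];
    } else {
      // X^j with N <= j < 2N: same rotation, opposite sign (X^N = -1).
      uint32_t jj = j - degree;
      if (tid < jj)
        result[tid] = acc[tid + degree - jj] - acc[tid];
      else
        result[tid] = -acc[tid - jj] - acc[tid];
    }
  }
  return result;
}

int main() {
  // (1 + 2X + 3X^2 + 4X^3) * X - itself = (-4 + X + 2X^2 + 3X^3) - acc.
  std::vector<int64_t> acc = {1, 2, 3, 4};
  auto r = multiply_by_monomial_and_sub(acc, 1);
  assert((r == std::vector<int64_t>{-5, -1, -1, -1}));
}
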
@@ -94,54 +136,28 @@ __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
}

/*
 * Performs result_acc = acc * (X^ä - 1) - acc
 * takes single buffer as input returns single rotated buffer
 */
template <typename T, int elems_per_thread, int block_size>
__device__ void
multiply_by_monomial_negacyclic_and_sub_polynomial(T *acc, T *result_acc,
                                                    uint32_t j) {
  int tid = threadIdx.x;
  constexpr int degree = block_size * elems_per_thread;
  for (int i = 0; i < elems_per_thread; i++) {
    if (j < degree) {
      if (tid < j) {
        result_acc[tid] = -acc[tid - j + degree] - acc[tid];
      } else {
        result_acc[tid] = acc[tid - j] - acc[tid];
      }
    } else {
      uint32_t jj = j - degree;
      if (tid < jj) {
        result_acc[tid] = acc[tid - jj + degree] - acc[tid];

      } else {
        result_acc[tid] = -acc[tid - jj] - acc[tid];
      }
    }
    tid += block_size;
  }
}

/*
 * performs a rounding to increase accuracy of the PBS
 * calculates inplace.
 * Receives num_poly concatenated polynomials of type T. For each performs a
 * rounding to increase accuracy of the PBS. Calculates inplace.
 */
template <typename T, int elems_per_thread, int block_size>
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
                                                  int level_count) {
  int tid = threadIdx.x;
  for (int i = 0; i < elems_per_thread; i++) {

    T x_acc = rotated_acc[tid];
    T shift = sizeof(T) * 8 - level_count * base_log;
    T mask = 1ll << (shift - 1);
    T b_acc = (x_acc & mask) >> (shift - 1);
    T res_acc = x_acc >> shift;
    res_acc += b_acc;
    res_acc <<= shift;
    rotated_acc[tid] = res_acc;
    tid = tid + block_size;
                                                  int level_count,
                                                  uint32_t num_poly) {
  constexpr int degree = block_size * elems_per_thread;
  for (int z = 0; z < num_poly; z++) {
    T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
    int tid = threadIdx.x;
    for (int i = 0; i < elems_per_thread; i++) {
      T x_acc = rotated_acc_slice[tid];
      T shift = sizeof(T) * 8 - level_count * base_log;
      T mask = 1ll << (shift - 1);
      T b_acc = (x_acc & mask) >> (shift - 1);
      T res_acc = x_acc >> shift;
      res_acc += b_acc;
      res_acc <<= shift;
      rotated_acc_slice[tid] = res_acc;
      tid = tid + block_size;
    }
  }
}
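
round_to_closest_multiple_inplace keeps only the top base_log * level_count bits of each coefficient, rounding to the nearest multiple of 2^(bits - base_log * level_count) so the gadget decomposition sees exactly the precision it will consume. A scalar reference with two boundary cases:

// Scalar reference for the per-coefficient rounding step.
#include <cassert>
#include <cstdint>

uint64_t round_to_closest_multiple(uint64_t x, uint32_t base_log,
                                   uint32_t level_count) {
  const uint32_t shift = 64 - base_log * level_count;
  uint64_t rounding_bit = (x >> (shift - 1)) & 1; // bit just below the kept part
  uint64_t res = (x >> shift) + rounding_bit;     // round half up
  return res << shift;
}

int main() {
  // base_log = 4, level_count = 3 keeps the top 12 bits of a 64-bit torus.
  assert(round_to_closest_multiple(0x0008000000000000ull, 4, 3) ==
         0x0010000000000000ull); // exactly halfway rounds up
  assert(round_to_closest_multiple(0x0007ffffffffffffull, 4, 3) ==
         0x0000000000000000ull); // just below halfway rounds down
}
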
@@ -176,80 +192,85 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
  }
}

// Extracts the body of a GLWE with dimension glwe_dimension
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator) {
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
                                    uint32_t glwe_dimension) {
  // Set first coefficient of the accumulator as the body of the LWE sample
  lwe_array_out[params::degree] = accumulator[0];
  lwe_array_out[glwe_dimension * params::degree] =
      accumulator[glwe_dimension * params::degree];
}

// Extracts the mask of a GLWE with dimension glwe_dimension
template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator) {
  // Set ACC = -ACC
  // accumulator.negate_inplace();
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
                                    uint32_t glwe_dimension) {
  for (int z = 0; z < glwe_dimension; z++) {
    Torus *lwe_array_out_slice =
        (Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
    Torus *accumulator_slice =
        (Torus *)accumulator + (ptrdiff_t)(z * params::degree);

  int tid = threadIdx.x;
    // Set ACC = -ACC
    int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt; i++) {
    accumulator[tid] = -accumulator[tid];
    tid = tid + params::degree / params::opt;
  }
  synchronize_threads_in_block();

  // Reverse the accumulator
  // accumulator.reverse_inplace();

  tid = threadIdx.x;
  Torus result[params::opt];
#pragma unroll
  for (int i = 0; i < params::opt; i++) {
    result[i] = accumulator[params::degree - tid - 1];
    tid = tid + params::degree / params::opt;
  }
  synchronize_threads_in_block();
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt; i++) {
    accumulator[tid] = result[i];
    tid = tid + params::degree / params::opt;
  }
  synchronize_threads_in_block();

  // Perform ACC * X
  // accumulator.multiply_by_monomial_negacyclic_inplace(1);

  tid = threadIdx.x;
  result[params::opt];
  for (int i = 0; i < params::opt; i++) {
    if (1 < params::degree) {
      if (tid < 1)
        result[i] = -accumulator[tid - 1 + params::degree];
      else
        result[i] = accumulator[tid - 1];
    } else {
      uint32_t jj = 1 - (uint32_t)params::degree;
      if (tid < jj)
        result[i] = accumulator[tid - jj + params::degree];
      else
        result[i] = -accumulator[tid - jj];
    for (int i = 0; i < params::opt; i++) {
      accumulator_slice[tid] = -accumulator_slice[tid];
      tid = tid + params::degree / params::opt;
    }
      tid += params::degree / params::opt;
  }
  synchronize_threads_in_block();
  tid = threadIdx.x;
  for (int i = 0; i < params::opt; i++) {
    accumulator[tid] = result[i];
    tid += params::degree / params::opt;
  }
  synchronize_threads_in_block();
    synchronize_threads_in_block();

  // Copy to the mask of the LWE sample
  // accumulator.copy_into(lwe_array_out);

  tid = threadIdx.x;
    // Reverse the accumulator
    tid = threadIdx.x;
    Torus result[params::opt];
#pragma unroll
  for (int i = 0; i < params::opt; i++) {
    lwe_array_out[tid] = accumulator[tid];
    tid = tid + params::degree / params::opt;
    for (int i = 0; i < params::opt; i++) {
      result[i] = accumulator_slice[params::degree - tid - 1];
      tid = tid + params::degree / params::opt;
    }
    synchronize_threads_in_block();
    tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      accumulator_slice[tid] = result[i];
      tid = tid + params::degree / params::opt;
    }
    synchronize_threads_in_block();

    // Perform ACC * X
    // (equivalent to multiply_by_monomial_negacyclic_inplace(1))
    tid = threadIdx.x;
    result[params::opt];
    for (int i = 0; i < params::opt; i++) {
      if (1 < params::degree) {
        if (tid < 1)
          result[i] = -accumulator_slice[tid - 1 + params::degree];
        else
          result[i] = accumulator_slice[tid - 1];
      } else {
        uint32_t jj = 1 - (uint32_t)params::degree;
        if (tid < jj)
          result[i] = accumulator_slice[tid - jj + params::degree];
        else
          result[i] = -accumulator_slice[tid - jj];
      }
      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();
    tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      accumulator_slice[tid] = result[i];
      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();

    // Copy to the mask of the LWE sample
    tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      lwe_array_out_slice[tid] = accumulator_slice[tid];
      tid = tid + params::degree / params::opt;
    }
  }
}
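
The negate / reverse / multiply-by-X sequence above is the standard sample extraction at index 0: for each mask polynomial the output LWE mask coefficients are a_0 = acc[0] and a_i = -acc[N - i] for i > 0, and the output body is the constant coefficient of the GLWE body polynomial. A compact host-side reference, assuming a GLWE of dimension 2 over degree-4 polynomials:

// Reference of sample extraction at index 0 for a GLWE of dimension k.
#include <cstdint>
#include <vector>

std::vector<int64_t>
sample_extract(const std::vector<std::vector<int64_t>> &mask_polys,
               const std::vector<int64_t> &body_poly) {
  const size_t N = body_poly.size();
  std::vector<int64_t> lwe_out;
  for (const auto &acc : mask_polys) { // one slice per mask polynomial
    lwe_out.push_back(acc[0]);
    for (size_t i = 1; i < N; i++)
      lwe_out.push_back(-acc[N - i]);
  }
  lwe_out.push_back(body_poly[0]);     // body of the extracted LWE
  return lwe_out;
}

int main() {
  // glwe_dimension = 2, N = 4: the LWE has 2 * 4 mask coefficients + body.
  std::vector<std::vector<int64_t>> mask = {{1, 2, 3, 4}, {5, 6, 7, 8}};
  std::vector<int64_t> body = {9, 10, 11, 12};
  auto lwe = sample_extract(mask, body);
  (void)lwe; // {1, -4, -3, -2, 5, -8, -7, -6, 9}
}
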
@@ -101,8 +101,10 @@ cmux(Torus *glwe_array_out, Torus *glwe_array_in, double2 *ggsw_in,
    pos += params::degree / params::opt;
  }

  GadgetMatrix<Torus, params> gadget_mask(base_log, level_count, glwe_sub_mask);
  GadgetMatrix<Torus, params> gadget_body(base_log, level_count, glwe_sub_body);
  GadgetMatrix<Torus, params> gadget_mask(base_log, level_count, glwe_sub_mask,
                                          1);
  GadgetMatrix<Torus, params> gadget_body(base_log, level_count, glwe_sub_body,
                                          1);
  // Subtract each glwe operand, decompose the resulting
  // polynomial coefficients to multiply each decomposed level
  // with the corresponding part of the LUT
@@ -420,12 +422,12 @@ __global__ void device_blind_rotation_and_sample_extraction(
    // Body
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator_c1, accumulator_c0, (1 << monomial_degree), false);
        accumulator_c1, accumulator_c0, (1 << monomial_degree), false, 1);
    // Mask
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator_c1 + polynomial_size, accumulator_c0 + polynomial_size,
        (1 << monomial_degree), false);
        (1 << monomial_degree), false, 1);

    monomial_degree += 1;

@@ -445,9 +447,8 @@ __global__ void device_blind_rotation_and_sample_extraction(
  // Now we can perform the sample extraction: for the body it's just
  // the resulting constant coefficient of the accumulator
  // For the mask it's more complicated
  sample_extract_mask<Torus, params>(block_lwe_out, accumulator_c0);
  sample_extract_body<Torus, params>(block_lwe_out,
                                     accumulator_c0 + polynomial_size);
  sample_extract_mask<Torus, params>(block_lwe_out, accumulator_c0, 1);
  sample_extract_body<Torus, params>(block_lwe_out, accumulator_c0, 1);
}

template <typename Torus, typename STorus, class params>