refactor(cuda): Refactor the amortized PBS.

This commit is contained in:
Pedro Alves
2023-02-06 14:45:22 -03:00
committed by Agnès Leroy
parent 2a299664e7
commit 41be2b4832
8 changed files with 321 additions and 318 deletions

View File

@@ -13,8 +13,6 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
assert(
("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32));
assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -26,36 +24,41 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
host_bootstrap_amortized<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
default:
break;
@@ -136,8 +139,6 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
assert(
("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64));
assert(("Error (GPU amortized PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -149,36 +150,41 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
host_bootstrap_amortized<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, lwe_idx, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
default:
break;

View File

@@ -53,8 +53,8 @@ template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_amortized(
Torus *lwe_array_out, Torus *lut_vector, uint32_t *lut_vector_indexes,
Torus *lwe_array_in, double2 *bootstrapping_key, char *device_mem,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t lwe_idx,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t lwe_idx,
size_t device_memory_size_per_sample) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -69,26 +69,22 @@ __global__ void device_bootstrap_amortized(
// For GPU bootstrapping the GLWE dimension is hard-set to 1: there is only
// one mask polynomial and 1 body to handle.
Torus *accumulator_mask = (Torus *)selected_memory;
Torus *accumulator_body =
(Torus *)accumulator_mask + (ptrdiff_t)polynomial_size;
Torus *accumulator_mask_rotated =
(Torus *)accumulator_body + (ptrdiff_t)polynomial_size;
Torus *accumulator_body_rotated =
(Torus *)accumulator_mask_rotated + (ptrdiff_t)polynomial_size;
double2 *mask_res_fft = (double2 *)accumulator_body_rotated +
polynomial_size / (sizeof(double2) / sizeof(Torus));
double2 *body_res_fft =
(double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2;
Torus *accumulator = (Torus *)selected_memory;
Torus *accumulator_rotated =
(Torus *)accumulator +
(ptrdiff_t)((glwe_dimension + 1) * polynomial_size);
double2 *res_fft =
(double2 *)accumulator_rotated + (glwe_dimension + 1) * polynomial_size /
(sizeof(double2) / sizeof(Torus));
double2 *accumulator_fft = (double2 *)sharedmem;
if constexpr (SMD != PARTIALSM)
accumulator_fft =
(double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);
accumulator_fft = (double2 *)res_fft +
(ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2);
auto block_lwe_array_in = &lwe_array_in[blockIdx.x * (lwe_dimension + 1)];
Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
2];
(glwe_dimension + 1)];
// Put "b", the body, in [0, 2N[
Torus b_hat = 0;
@@ -97,11 +93,7 @@ __global__ void device_bootstrap_amortized(
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_mask, block_lut_vector, b_hat, false);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_body, &block_lut_vector[params::degree], b_hat, false);
accumulator, block_lut_vector, b_hat, false, glwe_dimension + 1);
// Loop over all the mask elements of the sample to accumulate
// (X^a_i-1) multiplication, decomposition of the resulting polynomial
@@ -118,11 +110,7 @@ __global__ void device_bootstrap_amortized(
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator_mask, accumulator_mask_rotated, a_hat);
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator_body, accumulator_body_rotated, a_hat);
accumulator, accumulator_rotated, a_hat, glwe_dimension + 1);
synchronize_threads_in_block();
@@ -130,153 +118,122 @@ __global__ void device_bootstrap_amortized(
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_mask_rotated, base_log, level_count);
accumulator_rotated, base_log, level_count, glwe_dimension + 1);
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_body_rotated, base_log, level_count);
// Initialize the polynomial multiplication via FFT arrays
// The polynomial multiplications happens at the block level
// and each thread handles two or more coefficients
int pos = threadIdx.x;
for (int j = 0; j < params::opt / 2; j++) {
mask_res_fft[pos].x = 0;
mask_res_fft[pos].y = 0;
body_res_fft[pos].x = 0;
body_res_fft[pos].y = 0;
for (int j = 0; j < (glwe_dimension + 1) * params::opt / 2; j++) {
res_fft[pos].x = 0;
res_fft[pos].y = 0;
pos += params::degree / params::opt;
}
GadgetMatrix<Torus, params> gadget_mask(base_log, level_count,
accumulator_mask_rotated);
GadgetMatrix<Torus, params> gadget_body(base_log, level_count,
accumulator_body_rotated);
GadgetMatrix<Torus, params> gadget(base_log, level_count,
accumulator_rotated, glwe_dimension + 1);
// Now that the rotation is done, decompose the resulting polynomial
// coefficients so as to multiply each decomposed level with the
// corresponding part of the bootstrapping key
for (int level = level_count - 1; level >= 0; level--) {
gadget.decompose_and_compress_next(accumulator_fft);
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto accumulator_fft_slice = accumulator_fft + i * params::degree / 2;
gadget_mask.decompose_and_compress_next(accumulator_fft);
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(accumulator_fft_slice);
synchronize_threads_in_block();
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice = get_ith_mask_kth_block(bootstrapping_key, iteration, i,
level, polynomial_size,
glwe_dimension, level_count);
synchronize_threads_in_block();
// Perform the coefficient-wise product with the two pieces of
// bootstrapping key
for (int j = 0; j < (glwe_dimension + 1); j++) {
auto bsk_poly = bsk_slice + j * params::degree / 2;
auto res_fft_poly = res_fft + j * params::degree / 2;
polynomial_product_accumulate_in_fourier_domain<params, double2>(
res_fft_poly, accumulator_fft_slice, bsk_poly);
}
}
synchronize_threads_in_block();
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_mask_slice =
get_ith_mask_kth_block(bootstrapping_key, iteration, 0, level,
polynomial_size, 1, level_count);
auto bsk_body_slice =
get_ith_body_kth_block(bootstrapping_key, iteration, 0, level,
polynomial_size, 1, level_count);
synchronize_threads_in_block();
// Perform the coefficient-wise product with the two pieces of
// bootstrapping key
polynomial_product_accumulate_in_fourier_domain<params, double2>(
mask_res_fft, accumulator_fft, bsk_mask_slice);
polynomial_product_accumulate_in_fourier_domain<params, double2>(
body_res_fft, accumulator_fft, bsk_body_slice);
synchronize_threads_in_block();
gadget_body.decompose_and_compress_next(accumulator_fft);
// Now handle the polynomial multiplication for the body
// in the same way
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
auto bsk_mask_slice_2 =
get_ith_mask_kth_block(bootstrapping_key, iteration, 1, level,
polynomial_size, 1, level_count);
auto bsk_body_slice_2 =
get_ith_body_kth_block(bootstrapping_key, iteration, 1, level,
polynomial_size, 1, level_count);
synchronize_threads_in_block();
polynomial_product_accumulate_in_fourier_domain<params, double2>(
mask_res_fft, accumulator_fft, bsk_mask_slice_2);
polynomial_product_accumulate_in_fourier_domain<params, double2>(
body_res_fft, accumulator_fft, bsk_body_slice_2);
}
// Come back to the coefficient representation
if constexpr (SMD == FULLSM || SMD == NOSM) {
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(mask_res_fft);
NSMFFT_inverse<HalfDegree<params>>(body_res_fft);
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto res_fft_slice = res_fft + i * params::degree / 2;
NSMFFT_inverse<HalfDegree<params>>(res_fft_slice);
}
synchronize_threads_in_block();
add_to_torus<Torus, params>(mask_res_fft, accumulator_mask);
add_to_torus<Torus, params>(body_res_fft, accumulator_body);
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto accumulator_slice = accumulator + i * params::degree;
auto res_fft_slice = res_fft + i * params::degree / 2;
add_to_torus<Torus, params>(res_fft_slice, accumulator_slice);
}
synchronize_threads_in_block();
} else {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
accumulator_fft[tid] = mask_res_fft[tid];
tid = tid + params::degree / params::opt;
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto accumulator_slice = accumulator + i * params::degree;
auto res_fft_slice = res_fft + i * params::degree / 2;
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
accumulator_fft[tid] = res_fft_slice[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(accumulator_fft, accumulator_slice);
}
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(accumulator_fft, accumulator_mask);
synchronize_threads_in_block();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
accumulator_fft[tid] = body_res_fft[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(accumulator_fft, accumulator_body);
synchronize_threads_in_block();
}
}
auto block_lwe_array_out = &lwe_array_out[blockIdx.x * (polynomial_size + 1)];
auto block_lwe_array_out =
&lwe_array_out[blockIdx.x * (glwe_dimension * polynomial_size + 1)];
// The blind rotation for this block is over
// Now we can perform the sample extraction: for the body it's just
// the resulting constant coefficient of the accumulator
// For the mask it's more complicated
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator_mask);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator_body);
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
}
template <typename Torus, class params>
__host__ void host_bootstrap_amortized(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
uint32_t *lut_vector_indexes, Torus *lwe_array_in,
double2 *bootstrapping_key, uint32_t input_lwe_dimension,
double2 *bootstrapping_key, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator mask
sizeof(Torus) * polynomial_size + // accumulator body
sizeof(Torus) * polynomial_size + // accumulator mask rotated
sizeof(Torus) * polynomial_size + // accumulator body rotated
sizeof(double2) * polynomial_size / 2 + // accumulator fft mask
sizeof(double2) * polynomial_size / 2 + // accumulator fft body
sizeof(double2) * polynomial_size / 2; // calculate buffer fft
int SM_FULL =
sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
sizeof(Torus) * polynomial_size *
(glwe_dimension + 1) + // accumulator rotated
sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1) + // accumulator fft
sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1); // calculate buffer fft
int SM_PART = sizeof(double2) * polynomial_size / 2; // calculate buffer fft
int SM_PART = sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1); // calculate buffer fft
int DM_PART = SM_FULL - SM_PART;
@@ -304,8 +261,8 @@ __host__ void host_bootstrap_amortized(
stream, gpu_index);
device_bootstrap_amortized<Torus, params, NOSM><<<grid, thds, 0, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
base_log, level_count, lwe_idx, DM_FULL);
bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, DM_FULL);
} else if (max_shared_memory < SM_FULL) {
cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART);
@@ -316,8 +273,8 @@ __host__ void host_bootstrap_amortized(
device_bootstrap_amortized<Torus, params, PARTIALSM>
<<<grid, thds, SM_PART, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
base_log, level_count, lwe_idx, DM_PART);
bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, DM_PART);
} else {
// For devices with compute capability 7.x a single thread block can
// address the full capacity of shared memory. Shared memory on the
@@ -335,11 +292,10 @@ __host__ void host_bootstrap_amortized(
device_bootstrap_amortized<Torus, params, FULLSM>
<<<grid, thds, SM_FULL, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, d_mem, input_lwe_dimension, polynomial_size,
base_log, level_count, lwe_idx, 0);
bootstrapping_key, d_mem, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, 0);
}
check_cuda_error(cudaGetLastError());
cuda_drop_async(d_mem, stream, gpu_index);
}

View File

@@ -182,11 +182,11 @@ __global__ void device_bootstrap_low_latency(
if (blockIdx.y == 0) {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, block_lut_vector, b_hat, false);
accumulator, block_lut_vector, b_hat, false, 1);
} else {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[params::degree], b_hat, false);
accumulator, &block_lut_vector[params::degree], b_hat, false, 1);
}
for (int i = 0; i < lwe_dimension; i++) {
@@ -200,13 +200,13 @@ __global__ void device_bootstrap_low_latency(
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
accumulator, accumulator_rotated, a_hat, 1);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count);
accumulator_rotated, base_log, level_count, 1);
synchronize_threads_in_block();
@@ -214,7 +214,7 @@ __global__ void device_bootstrap_low_latency(
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
accumulator_rotated, 1);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
@@ -234,9 +234,9 @@ __global__ void device_bootstrap_low_latency(
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1);
} else if (blockIdx.x == 0 && blockIdx.y == 1) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator);
block_lwe_array_out[params::degree] = accumulator[0];
}
}

View File

@@ -144,8 +144,8 @@ __host__ void host_circuit_bootstrap(
host_bootstrap_amortized<Torus, params>(
v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_vector,
lut_vector_indexes, lwe_array_in_shifted_buffer, fourier_bsk,
lwe_dimension, polynomial_size, base_log_bsk, level_bsk, pbs_count,
level_cbs, 0, max_shared_memory);
glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, level_bsk,
pbs_count, level_cbs, 0, max_shared_memory);
dim3 copy_grid(pbs_count * (glwe_dimension + 1), 1, 1);
dim3 copy_block(params::degree / params::opt, 1, 1);

View File

@@ -15,6 +15,7 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
level_count;
}
////////////////////////////////////////////////
template <typename T>
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
@@ -35,8 +36,9 @@ __device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
level * polynomial_size / 2 * (glwe_dimension + 1) *
(glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1) +
polynomial_size / 2];
glwe_dimension * polynomial_size / 2];
}
////////////////////////////////////////////////
template <typename T, typename ST>
void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,

View File

@@ -4,6 +4,15 @@
#include "polynomial/polynomial.cuh"
#include <cstdint>
/**
* GadgetMatrix implements the iterator design pattern to decompose a set of
* num_poly consecutive polynomials with degree params::degree. A total of
* level_count levels is expected and each call to decompose_and_compress_next()
* writes to the result the next level. It is also possible to advance an
* arbitrary amount of levels by using decompose_and_compress_level().
*
* This class always decomposes the entire set of num_poly polynomials.
*/
#pragma once
template <typename T, class params> class GadgetMatrix {
private:
@@ -11,19 +20,22 @@ private:
uint32_t base_log;
uint32_t mask;
uint32_t halfbg;
uint32_t num_poly;
T offset;
int current_level;
T mask_mod_b;
T *state;
public:
__device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state)
: base_log(base_log), level_count(level_count), state(state) {
__device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
uint32_t num_poly)
: base_log(base_log), level_count(level_count), num_poly(num_poly),
state(state) {
mask_mod_b = (1ll << base_log) - 1ll;
current_level = level_count;
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
for (int i = 0; i < num_poly * params::opt; i++) {
state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
tid += params::degree / params::opt;
}
@@ -31,26 +43,31 @@ public:
}
__device__ void decompose_and_compress_next(double2 *result) {
int tid = threadIdx.x;
current_level -= 1;
for (int i = 0; i < params::opt / 2; i++) {
T res_re = state[tid] & mask_mod_b;
T res_im = state[tid + params::degree / 2] & mask_mod_b;
state[tid] >>= base_log;
state[tid + params::degree / 2] >>= base_log;
T carry_re = ((res_re - 1ll) | state[tid]) & res_re;
T carry_im = ((res_im - 1ll) | state[tid + params::degree / 2]) & res_im;
carry_re >>= (base_log - 1);
carry_im >>= (base_log - 1);
state[tid] += carry_re;
state[tid + params::degree / 2] += carry_im;
res_re -= carry_re << base_log;
res_im -= carry_im << base_log;
for (int j = 0; j < num_poly; j++) {
int tid = threadIdx.x;
auto state_slice = state + j * params::degree;
auto result_slice = result + j * params::degree / 2;
for (int i = 0; i < params::opt / 2; i++) {
T res_re = state_slice[tid] & mask_mod_b;
T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
state_slice[tid] >>= base_log;
state_slice[tid + params::degree / 2] >>= base_log;
T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
T carry_im =
((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
carry_re >>= (base_log - 1);
carry_im >>= (base_log - 1);
state_slice[tid] += carry_re;
state_slice[tid + params::degree / 2] += carry_im;
res_re -= carry_re << base_log;
res_im -= carry_im << base_log;
result[tid].x = (int32_t)res_re;
result[tid].y = (int32_t)res_im;
result_slice[tid].x = (int32_t)res_re;
result_slice[tid].y = (int32_t)res_im;
tid += params::degree / params::opt;
tid += params::degree / params::opt;
}
}
synchronize_threads_in_block();
}

View File

@@ -56,36 +56,78 @@ __device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
}
/*
* Receives num_poly concatenated polynomials of type T. For each:
*
* Performs acc = acc * (X^â + 1) if zeroAcc = false
* Performs acc = 0 if zeroAcc
* takes single buffer and calculates inplace.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
uint32_t j,
bool zeroAcc) {
int tid = threadIdx.x;
uint32_t j, bool zeroAcc,
uint32_t num_poly) {
constexpr int degree = block_size * elems_per_thread;
if (zeroAcc) {
for (int i = 0; i < elems_per_thread; i++) {
accumulator[tid] = 0;
tid += block_size;
for (int z = 0; z < num_poly; z++) {
T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
T *input_slice = (T *)input + (ptrdiff_t)(z * degree);
int tid = threadIdx.x;
if (zeroAcc) {
for (int i = 0; i < elems_per_thread; i++) {
accumulator_slice[tid] = 0;
tid += block_size;
}
} else {
tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
if (tid < degree - j) {
accumulator_slice[tid] = input_slice[tid + j];
} else {
accumulator_slice[tid] = -input_slice[tid - degree + j];
}
} else {
uint32_t jj = j - degree;
if (tid < degree - jj) {
accumulator_slice[tid] = -input_slice[tid + jj];
} else {
accumulator_slice[tid] = input_slice[tid - degree + jj];
}
}
tid += block_size;
}
}
} else {
tid = threadIdx.x;
}
}
/*
* Receives num_poly concatenated polynomials of type T. For each:
*
* Performs result_acc = acc * (X^â - 1) - acc
* takes single buffer as input and returns a single rotated buffer
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
T *acc, T *result_acc, uint32_t j, uint32_t num_poly) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
T *result_acc_slice = (T *)result_acc + (ptrdiff_t)(z * degree);
int tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
if (tid < degree - j) {
accumulator[tid] = input[tid + j];
if (tid < j) {
result_acc_slice[tid] = -acc_slice[tid - j + degree] - acc_slice[tid];
} else {
accumulator[tid] = -input[tid - degree + j];
result_acc_slice[tid] = acc_slice[tid - j] - acc_slice[tid];
}
} else {
uint32_t jj = j - degree;
if (tid < degree - jj) {
accumulator[tid] = -input[tid + jj];
if (tid < jj) {
result_acc_slice[tid] = acc_slice[tid - jj + degree] - acc_slice[tid];
} else {
accumulator[tid] = input[tid - degree + jj];
result_acc_slice[tid] = -acc_slice[tid - jj] - acc_slice[tid];
}
}
tid += block_size;
@@ -94,54 +136,28 @@ __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
}
/*
* Performs result_acc = acc * (X^â - 1) - acc
* takes single buffer as input returns single rotated buffer
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void
multiply_by_monomial_negacyclic_and_sub_polynomial(T *acc, T *result_acc,
uint32_t j) {
int tid = threadIdx.x;
constexpr int degree = block_size * elems_per_thread;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
if (tid < j) {
result_acc[tid] = -acc[tid - j + degree] - acc[tid];
} else {
result_acc[tid] = acc[tid - j] - acc[tid];
}
} else {
uint32_t jj = j - degree;
if (tid < jj) {
result_acc[tid] = acc[tid - jj + degree] - acc[tid];
} else {
result_acc[tid] = -acc[tid - jj] - acc[tid];
}
}
tid += block_size;
}
}
/*
* performs a rounding to increase accuracy of the PBS
* calculates inplace.
* Receives num_poly concatenated polynomials of type T. For each performs a
* rounding to increase accuracy of the PBS. Calculates inplace.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
int level_count) {
int tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
T x_acc = rotated_acc[tid];
T shift = sizeof(T) * 8 - level_count * base_log;
T mask = 1ll << (shift - 1);
T b_acc = (x_acc & mask) >> (shift - 1);
T res_acc = x_acc >> shift;
res_acc += b_acc;
res_acc <<= shift;
rotated_acc[tid] = res_acc;
tid = tid + block_size;
int level_count,
uint32_t num_poly) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
int tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
T x_acc = rotated_acc_slice[tid];
T shift = sizeof(T) * 8 - level_count * base_log;
T mask = 1ll << (shift - 1);
T b_acc = (x_acc & mask) >> (shift - 1);
T res_acc = x_acc >> shift;
res_acc += b_acc;
res_acc <<= shift;
rotated_acc_slice[tid] = res_acc;
tid = tid + block_size;
}
}
}
@@ -176,80 +192,85 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
}
}
// Extracts the body of a GLWE with dimension glwe_dimension
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator) {
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
uint32_t glwe_dimension) {
// Set first coefficient of the accumulator as the body of the LWE sample
lwe_array_out[params::degree] = accumulator[0];
lwe_array_out[glwe_dimension * params::degree] =
accumulator[glwe_dimension * params::degree];
}
// Extracts the mask of a GLWE with dimension glwe_dimension
template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator) {
// Set ACC = -ACC
// accumulator.negate_inplace();
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
uint32_t glwe_dimension) {
for (int z = 0; z < glwe_dimension; z++) {
Torus *lwe_array_out_slice =
(Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
Torus *accumulator_slice =
(Torus *)accumulator + (ptrdiff_t)(z * params::degree);
int tid = threadIdx.x;
// Set ACC = -ACC
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = -accumulator[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
// Reverse the accumulator
// accumulator.reverse_inplace();
tid = threadIdx.x;
Torus result[params::opt];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
result[i] = accumulator[params::degree - tid - 1];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = result[i];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
// Perform ACC * X
// accumulator.multiply_by_monomial_negacyclic_inplace(1);
tid = threadIdx.x;
result[params::opt];
for (int i = 0; i < params::opt; i++) {
if (1 < params::degree) {
if (tid < 1)
result[i] = -accumulator[tid - 1 + params::degree];
else
result[i] = accumulator[tid - 1];
} else {
uint32_t jj = 1 - (uint32_t)params::degree;
if (tid < jj)
result[i] = accumulator[tid - jj + params::degree];
else
result[i] = -accumulator[tid - jj];
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] = -accumulator_slice[tid];
tid = tid + params::degree / params::opt;
}
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = result[i];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
synchronize_threads_in_block();
// Copy to the mask of the LWE sample
// accumulator.copy_into(lwe_array_out);
tid = threadIdx.x;
// Reverse the accumulator
tid = threadIdx.x;
Torus result[params::opt];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
lwe_array_out[tid] = accumulator[tid];
tid = tid + params::degree / params::opt;
for (int i = 0; i < params::opt; i++) {
result[i] = accumulator_slice[params::degree - tid - 1];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] = result[i];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
// Perform ACC * X
// (equivalent to multiply_by_monomial_negacyclic_inplace(1))
tid = threadIdx.x;
result[params::opt];
for (int i = 0; i < params::opt; i++) {
if (1 < params::degree) {
if (tid < 1)
result[i] = -accumulator_slice[tid - 1 + params::degree];
else
result[i] = accumulator_slice[tid - 1];
} else {
uint32_t jj = 1 - (uint32_t)params::degree;
if (tid < jj)
result[i] = accumulator_slice[tid - jj + params::degree];
else
result[i] = -accumulator_slice[tid - jj];
}
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] = result[i];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// Copy to the mask of the LWE sample
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
lwe_array_out_slice[tid] = accumulator_slice[tid];
tid = tid + params::degree / params::opt;
}
}
}

View File

@@ -101,8 +101,10 @@ cmux(Torus *glwe_array_out, Torus *glwe_array_in, double2 *ggsw_in,
pos += params::degree / params::opt;
}
GadgetMatrix<Torus, params> gadget_mask(base_log, level_count, glwe_sub_mask);
GadgetMatrix<Torus, params> gadget_body(base_log, level_count, glwe_sub_body);
GadgetMatrix<Torus, params> gadget_mask(base_log, level_count, glwe_sub_mask,
1);
GadgetMatrix<Torus, params> gadget_body(base_log, level_count, glwe_sub_body,
1);
// Subtract each glwe operand, decompose the resulting
// polynomial coefficients to multiply each decomposed level
// with the corresponding part of the LUT
@@ -420,12 +422,12 @@ __global__ void device_blind_rotation_and_sample_extraction(
// Body
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_c1, accumulator_c0, (1 << monomial_degree), false);
accumulator_c1, accumulator_c0, (1 << monomial_degree), false, 1);
// Mask
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_c1 + polynomial_size, accumulator_c0 + polynomial_size,
(1 << monomial_degree), false);
(1 << monomial_degree), false, 1);
monomial_degree += 1;
@@ -445,9 +447,8 @@ __global__ void device_blind_rotation_and_sample_extraction(
// Now we can perform the sample extraction: for the body it's just
// the resulting constant coefficient of the accumulator
// For the mask it's more complicated
sample_extract_mask<Torus, params>(block_lwe_out, accumulator_c0);
sample_extract_body<Torus, params>(block_lwe_out,
accumulator_c0 + polynomial_size);
sample_extract_mask<Torus, params>(block_lwe_out, accumulator_c0, 1);
sample_extract_body<Torus, params>(block_lwe_out, accumulator_c0, 1);
}
template <typename Torus, typename STorus, class params>