refactor(cuda): Refactor the low latency PBS.

Pedro Alves
2023-02-09 15:00:40 -03:00
committed by Agnès Leroy
parent 41be2b4832
commit db83fd7649
10 changed files with 165 additions and 170 deletions
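In short: the glwe_dimension == 1 asserts are dropped from the C entry points (extract bits, low latency PBS, circuit bootstrap, cmux tree, WOP-PBS), and glwe_dimension is threaded through host_extract_bits, host_bootstrap_low_latency, and the device_bootstrap_low_latency kernel instead. The separate mask and body join buffers are merged into a single join_buffer holding glwe_dimension + 1 polynomials per level, mul_ggsw_glwe loops over those polynomials rather than special-casing mask and body, the kernel grid grows to glwe_dimension + 1 y-blocks, and the polynomial helpers (GadgetMatrix, the negacyclic monomial routines, sample_extract_*) now default to operating on a single polynomial.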

View File

@@ -15,8 +15,6 @@ void cuda_extract_bits_32(
uint32_t max_shared_memory) {
assert(("Error (GPU extract bits): base log should be <= 32",
base_log_bsk <= 32));
assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
"512, 1024, 2048, 4096, 8192",
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
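A note on the assertion style used in these entry points: assert(("message", cond)) relies on the C comma operator, so only cond is tested while the string literal survives into the diagnostic printed when the assertion fires; the extra parentheses keep the preprocessor from splitting the macro argument at the comma. A minimal sketch:

```cpp
#include <cassert>

int main() {
  unsigned base_log_bsk = 23;
  // The parenthesized comma expression evaluates to its right-hand operand,
  // so only the condition decides the assertion; the string literal is kept
  // because it shows up in the diagnostic if the assertion ever fires.
  assert(("Error (GPU extract bits): base log should be <= 32",
          base_log_bsk <= 32));
  return 0;
}
```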
@@ -44,8 +42,8 @@ void cuda_extract_bits_32(
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 1024:
host_extract_bits<uint32_t, Degree<1024>>(
@@ -56,8 +54,8 @@ void cuda_extract_bits_32(
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 2048:
host_extract_bits<uint32_t, Degree<2048>>(
@@ -68,8 +66,8 @@ void cuda_extract_bits_32(
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 4096:
host_extract_bits<uint32_t, Degree<4096>>(
@@ -80,8 +78,8 @@ void cuda_extract_bits_32(
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 8192:
host_extract_bits<uint32_t, Degree<8192>>(
@@ -92,8 +90,8 @@ void cuda_extract_bits_32(
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
default:
break;
@@ -157,8 +155,6 @@ void cuda_extract_bits_64(
uint32_t max_shared_memory) {
assert(("Error (GPU extract bits): base log should be <= 64",
base_log_bsk <= 64));
assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
"512, 1024, 2048, 4096, 8192",
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
@@ -186,8 +182,8 @@ void cuda_extract_bits_64(
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 1024:
host_extract_bits<uint64_t, Degree<1024>>(
@@ -198,8 +194,8 @@ void cuda_extract_bits_64(
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 2048:
host_extract_bits<uint64_t, Degree<2048>>(
@@ -210,8 +206,8 @@ void cuda_extract_bits_64(
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 4096:
host_extract_bits<uint64_t, Degree<4096>>(
@@ -222,8 +218,8 @@ void cuda_extract_bits_64(
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
case 8192:
host_extract_bits<uint64_t, Degree<8192>>(
@@ -234,8 +230,8 @@ void cuda_extract_bits_64(
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_samples, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_samples, max_shared_memory);
break;
default:
break;

View File

@@ -140,9 +140,9 @@ __host__ void host_extract_bits(
Torus *lwe_array_out_pbs_buffer, Torus *lut_pbs,
uint32_t *lut_vector_indexes, Torus *ksk, double2 *fourier_bsk,
uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log_bsk, uint32_t level_count_bsk,
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
uint32_t max_shared_memory) {
uint32_t lwe_dimension_out, uint32_t glwe_dimension, uint32_t base_log_bsk,
uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk,
uint32_t number_of_samples, uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
@@ -189,8 +189,8 @@ __host__ void host_extract_bits(
host_bootstrap_low_latency<Torus, params>(
v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_pbs,
lut_vector_indexes, lwe_array_out_ks_buffer, fourier_bsk,
lwe_dimension_out, lwe_dimension_in, base_log_bsk, level_count_bsk,
number_of_samples, 1, max_shared_memory);
glwe_dimension, lwe_dimension_out, lwe_dimension_in, base_log_bsk,
level_count_bsk, number_of_samples, 1, max_shared_memory);
// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption
// of 0 if the extracted bit was 0, and of 1 otherwise
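In the extract-bits path itself, glwe_dimension is now simply forwarded to host_bootstrap_low_latency, with the trailing 1 still requesting a single LUT vector per bootstrap.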

View File

@@ -16,8 +16,6 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
assert(("Error (GPU low latency PBS): base log should be <= 32",
base_log <= 32));
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -39,36 +37,41 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
host_bootstrap_low_latency<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 1024:
host_bootstrap_low_latency<uint32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 2048:
host_bootstrap_low_latency<uint32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 4096:
host_bootstrap_low_latency<uint32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 8192:
host_bootstrap_low_latency<uint32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
default:
break;
@@ -160,8 +163,6 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
assert(("Error (GPU low latency PBS): base log should be <= 64",
base_log <= 64));
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -183,36 +184,41 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
host_bootstrap_low_latency<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 1024:
host_bootstrap_low_latency<uint64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 2048:
host_bootstrap_low_latency<uint64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 4096:
host_bootstrap_low_latency<uint64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 8192:
host_bootstrap_low_latency<uint64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
level_count, num_samples, num_lut_vectors, max_shared_memory);
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
default:
break;
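Note that the public signatures of cuda_bootstrap_low_latency_lwe_ciphertext_vector_32/_64 do not change here: glwe_dimension was already a parameter, and it is now honored and forwarded to the templated host function instead of being pinned to 1 by an assert.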

View File

@@ -25,11 +25,11 @@ using namespace cooperative_groups;
namespace cg = cooperative_groups;
template <typename Torus, class params>
__device__ void
mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,
double2 *body_join_buffer, double2 *bootstrapping_key,
int polynomial_size, int level_count, int iteration,
grid_group &grid) {
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
double2 *join_buffer, double2 *bootstrapping_key,
int polynomial_size, uint32_t glwe_dimension,
int level_count, int iteration,
grid_group &grid) {
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(fft);
@@ -40,52 +40,50 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,
// this function, so we end up getting the lines of the bootstrapping key
// needed to perform the external product in this block (corresponding to
// the same decomposition level)
auto bsk_slice = get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
glwe_dimension, level_count);
auto bsk_mask_slice =
get_ith_mask_kth_block(bootstrapping_key, iteration, blockIdx.y,
blockIdx.x, polynomial_size, 1, level_count);
auto bsk_body_slice =
get_ith_body_kth_block(bootstrapping_key, iteration, blockIdx.y,
blockIdx.x, polynomial_size, 1, level_count);
// Selects all GLWEs in a particular decomposition level
auto level_join_buffer =
join_buffer + blockIdx.x * (glwe_dimension + 1) * params::degree / 2;
// Perform the matrix multiplication between the GGSW and the GLWE,
// each block operating on a single level for mask and body
auto first_processed_bsk =
(blockIdx.y == 0) ? bsk_mask_slice : bsk_body_slice;
auto second_processed_bsk =
(blockIdx.y == 0) ? bsk_body_slice : bsk_mask_slice;
auto first_processed_acc =
(blockIdx.y == 0) ? &mask_join_buffer[params::degree / 2 * blockIdx.x]
: &body_join_buffer[params::degree / 2 * blockIdx.x];
auto second_processed_acc =
(blockIdx.y == 0) ? &body_join_buffer[params::degree / 2 * blockIdx.x]
: &mask_join_buffer[params::degree / 2 * blockIdx.x];
// The first product is used to initialize level_join_buffer
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
auto buffer_slice = level_join_buffer + blockIdx.y * params::degree / 2;
int tid = threadIdx.x;
// first product
for (int i = 0; i < params::opt / 2; i++) {
first_processed_acc[tid] = fft[tid] * first_processed_bsk[tid];
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
grid.sync();
tid = threadIdx.x;
// second product
for (int i = 0; i < params::opt / 2; i++) {
second_processed_acc[tid] += fft[tid] * second_processed_bsk[tid];
tid += params::degree / params::opt;
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
for (int j = 1; j < (glwe_dimension + 1); j++) {
int idx = (j + blockIdx.y) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * params::degree / 2;
auto buffer_slice = level_join_buffer + idx * params::degree / 2;
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
grid.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
// All blocks are synchronized here; after this sync, *_join_buffer has the
// values needed from every other block
grid.sync();
auto src_acc = (blockIdx.y == 0) ? mask_join_buffer : body_join_buffer;
auto src_acc = join_buffer + blockIdx.y * params::degree / 2;
// copy first product into fft buffer
tid = threadIdx.x;
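The rotated index in the new accumulation loop is what makes the merged join buffer safe: within a step, the glwe_dimension + 1 y-blocks touch pairwise distinct polynomials, and the grid sync between steps orders successive writers of the same polynomial. A host-side sketch of the schedule (illustrative only; the glwe_dimension value is arbitrary):

```cpp
#include <cstdio>

int main() {
  const int glwe_dimension = 2;      // arbitrary example value
  const int k1 = glwe_dimension + 1; // polynomials per GLWE (mask + body)
  // Step j = 0 is the "first product" that initializes the buffer; steps
  // j >= 1 accumulate. Within a step, the y-blocks hit pairwise distinct
  // polynomial indexes, so only the grid-wide sync between steps is needed
  // to order successive writers of the same polynomial.
  for (int j = 0; j < k1; j++) {
    printf("step %d:", j);
    for (int y = 0; y < k1; y++) {
      int idx = (j + y) % k1; // polynomial index written by y-block y
      printf("  block %d -> poly %d", y, idx);
    }
    printf("\n");
  }
  return 0;
}
```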
@@ -97,7 +95,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,
// accumulate rest of the products into fft buffer
for (int l = 1; l < gridDim.x; l++) {
auto cur_src_acc = &src_acc[l * params::degree / 2];
auto cur_src_acc = &src_acc[l * (glwe_dimension + 1) * params::degree / 2];
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] += cur_src_acc[tid];
@@ -121,19 +119,22 @@ template <typename Torus, class params, sharedMemDegree SMD>
/*
* Kernel launched by the low latency version of the
* bootstrapping, that uses cooperative groups
* lwe_array_out vector of output lwe s, with length
* (polynomial_size+1)*num_samples lut_vector - vector of look up tables with
* length polynomial_size * num_samples lut_vector_indexes - mapping between
* lwe_array_in and lut_vector lwe_array_in
* - vector of lwe inputs with length (lwe_dimension + 1) * num_samples
*
* - lwe_array_out: vector of output LWE ciphertexts, with length
* (glwe_dimension * polynomial_size + 1) * num_samples
* - lut_vector: vector of lookup tables, with length
* (glwe_dimension + 1) * polynomial_size * num_samples
* - lut_vector_indexes: mapping between lwe_array_in and lut_vector
* - lwe_array_in: vector of LWE inputs, with length
* (lwe_dimension + 1) * num_samples
*
* Each y-block computes one element of the lwe_array_out.
*/
__global__ void device_bootstrap_low_latency(
Torus *lwe_array_out, Torus *lut_vector, Torus *lwe_array_in,
double2 *bootstrapping_key, double2 *mask_join_buffer,
double2 *body_join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, char *device_mem,
int device_memory_size_per_block) {
double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
char *device_mem, int device_memory_size_per_block) {
grid_group grid = this_grid();
@@ -144,6 +145,7 @@ __global__ void device_bootstrap_low_latency(
char *selected_memory;
int block_index =
blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM)
selected_memory = sharedmem;
@@ -163,12 +165,12 @@ __global__ void device_bootstrap_low_latency(
// this block is operating, in the case of batch bootstraps
auto block_lwe_array_in = &lwe_array_in[blockIdx.z * (lwe_dimension + 1)];
auto block_lut_vector = &lut_vector[blockIdx.z * params::degree * 2];
auto block_lut_vector =
&lut_vector[blockIdx.z * params::degree * (glwe_dimension + 1)];
auto block_mask_join_buffer =
&mask_join_buffer[blockIdx.z * level_count * params::degree / 2];
auto block_body_join_buffer =
&body_join_buffer[blockIdx.z * level_count * params::degree / 2];
auto block_join_buffer =
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
params::degree / 2];
// Since the space in L1 cache is small, we use the same memory location for
// the rotated accumulator and the fft accumulator, since we know that the
@@ -179,15 +181,10 @@ __global__ void device_bootstrap_low_latency(
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
if (blockIdx.y == 0) {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, block_lut_vector, b_hat, false, 1);
} else {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[params::degree], b_hat, false, 1);
}
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
for (int i = 0; i < lwe_dimension; i++) {
synchronize_threads_in_block();
@@ -200,13 +197,13 @@ __global__ void device_bootstrap_low_latency(
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat, 1);
accumulator, accumulator_rotated, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count, 1);
accumulator_rotated, base_log, level_count);
synchronize_threads_in_block();
@@ -214,29 +211,33 @@ __global__ void device_bootstrap_low_latency(
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated, 1);
accumulator_rotated);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, params>(accumulator, accumulator_fft,
block_mask_join_buffer, block_body_join_buffer,
bootstrapping_key, polynomial_size,
level_count, i, grid);
mul_ggsw_glwe<Torus, params>(
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
polynomial_size, glwe_dimension, level_count, i, grid);
synchronize_threads_in_block();
}
auto block_lwe_array_out = &lwe_array_out[blockIdx.z * (polynomial_size + 1)];
auto block_lwe_array_out =
&lwe_array_out[blockIdx.z * (glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.x == 0 && blockIdx.y == 0) {
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator, 1);
} else if (blockIdx.x == 0 && blockIdx.y == 1) {
block_lwe_array_out[params::degree] = accumulator[0];
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
}
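With this epilogue, each output LWE is laid out as glwe_dimension * polynomial_size mask coefficients followed by one body coefficient: y-blocks 0 through glwe_dimension - 1 each extract the mask of one GLWE polynomial, and y-block glwe_dimension writes the body. A small sketch of the offset arithmetic (example values only):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t glwe_dimension = 2, polynomial_size = 512; // example values
  const uint32_t lwe_size = glwe_dimension * polynomial_size + 1;
  for (uint32_t z = 0; z < 2; z++) {                 // blockIdx.z: sample
    for (uint32_t y = 0; y <= glwe_dimension; y++) { // blockIdx.y
      uint32_t offset = z * lwe_size + y * polynomial_size;
      if (y < glwe_dimension)
        printf("sample %u, y-block %u: mask polynomial at offset %u\n",
               z, y, offset);
      else // y == glwe_dimension: the single body coefficient
        printf("sample %u, y-block %u: body coefficient at offset %u\n",
               z, y, offset);
    }
  }
  return 0;
}
```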
@@ -248,7 +249,7 @@ template <typename Torus, class params>
__host__ void host_bootstrap_low_latency(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
uint32_t *lut_vector_indexes, Torus *lwe_array_in,
double2 *bootstrapping_key, uint32_t lwe_dimension,
double2 *bootstrapping_key, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t max_shared_memory) {
@@ -258,10 +259,8 @@ __host__ void host_bootstrap_low_latency(
int buffer_size_per_gpu = level_count * input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2);
double2 *mask_buffer_fft =
(double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
double2 *body_buffer_fft =
(double2 *)cuda_malloc_async(buffer_size_per_gpu, stream, gpu_index);
double2 *buffer_fft = (double2 *)cuda_malloc_async(
(glwe_dimension + 1) * buffer_size_per_gpu, stream, gpu_index);
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
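For sizing intuition: the merged allocation replaces two buffers of buffer_size_per_gpu bytes (mask and body) with a single one scaled by glwe_dimension + 1, so for glwe_dimension = 1 the total byte count is unchanged. A quick host-side sanity check (example values; double2 is stood in for on the host):

```cpp
#include <cstddef>
#include <cstdio>

struct double2 { double x, y; }; // host stand-in for the CUDA vector type

int main() {
  const size_t level_count = 2, num_samples = 4, polynomial_size = 1024;
  const size_t glwe_dimension = 1; // example values
  size_t per_gpu = level_count * num_samples * polynomial_size / 2
                   * sizeof(double2);                // buffer_size_per_gpu
  size_t old_total = 2 * per_gpu;                    // mask + body buffers
  size_t new_total = (glwe_dimension + 1) * per_gpu; // single join buffer
  printf("old: %zu bytes, new: %zu bytes\n", old_total, new_total);
  return 0;
}
```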
@@ -279,35 +278,34 @@ __host__ void host_bootstrap_low_latency(
char *d_mem;
int thds = polynomial_size / params::opt;
dim3 grid(level_count, 2, input_lwe_ciphertext_count);
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
void *kernel_args[12];
void *kernel_args[11];
kernel_args[0] = &lwe_array_out;
kernel_args[1] = &lut_vector;
kernel_args[2] = &lwe_array_in;
kernel_args[3] = &bootstrapping_key;
kernel_args[4] = &mask_buffer_fft;
kernel_args[5] = &body_buffer_fft;
kernel_args[6] = &lwe_dimension;
kernel_args[7] = &polynomial_size;
kernel_args[8] = &base_log;
kernel_args[9] = &level_count;
kernel_args[10] = &d_mem;
kernel_args[4] = &buffer_fft;
kernel_args[5] = &lwe_dimension;
kernel_args[6] = &polynomial_size;
kernel_args[7] = &base_log;
kernel_args[8] = &level_count;
kernel_args[9] = &d_mem;
if (max_shared_memory < SM_PART) {
kernel_args[11] = &DM_FULL;
kernel_args[10] = &DM_FULL;
check_cuda_error(cudaGetLastError());
d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count *
level_count * 2,
level_count * (glwe_dimension + 1),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_bootstrap_low_latency<Torus, params, NOSM>, grid, thds,
(void **)kernel_args, 0, *stream));
} else if (max_shared_memory < SM_FULL) {
kernel_args[11] = &DM_PART;
kernel_args[10] = &DM_PART;
d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count *
level_count * 2,
level_count * (glwe_dimension + 1),
stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency<Torus, params, PARTIALSM>,
@@ -322,7 +320,7 @@ __host__ void host_bootstrap_low_latency(
} else {
int DM_NONE = 0;
kernel_args[11] = &DM_NONE;
kernel_args[10] = &DM_NONE;
d_mem = (char *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency<Torus, params, FULLSM>,
@@ -337,8 +335,7 @@ __host__ void host_bootstrap_low_latency(
check_cuda_error(cudaGetLastError());
// Synchronize the streams before copying the result to lwe_array_out at the
// right place
cuda_drop_async(mask_buffer_fft, stream, gpu_index);
cuda_drop_async(body_buffer_fft, stream, gpu_index);
cuda_drop_async(buffer_fft, stream, gpu_index);
cuda_drop_async(d_mem, stream, gpu_index);
}
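Since device_bootstrap_low_latency calls grid.sync(), it must be launched through cudaLaunchCooperativeKernel with the argument-pointer array built above; the <<<>>> syntax would not provide a grid-wide barrier. A minimal, self-contained sketch of that launch pattern (a toy kernel, not the PBS; assumes compilation with something like nvcc -arch=sm_70 -rdc=true):

```cuda
#include <cooperative_groups.h>
#include <cstdio>
namespace cg = cooperative_groups;

__global__ void toy_kernel(int *data, unsigned n) {
  cg::grid_group grid = cg::this_grid();
  unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n)
    data[idx] += 1;
  grid.sync(); // grid-wide barrier; only valid under a cooperative launch
  if (idx == 0)
    printf("all %u elements updated\n", n);
}

int main() {
  const unsigned n = 256;
  int *d_data;
  cudaMalloc(&d_data, n * sizeof(int));
  cudaMemset(d_data, 0, n * sizeof(int));
  dim3 grid(4), block(64);
  unsigned n_arg = n;
  // Arguments go in as an array of pointers to each parameter, mirroring
  // the kernel_args array built in host_bootstrap_low_latency above.
  void *kernel_args[] = {&d_data, &n_arg};
  cudaLaunchCooperativeKernel((void *)toy_kernel, grid, block, kernel_args,
                              /*sharedMem=*/0, /*stream=*/0);
  cudaDeviceSynchronize();
  cudaFree(d_data);
  return 0;
}
```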

View File

@@ -13,8 +13,6 @@ void cuda_circuit_bootstrap_32(
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -137,8 +135,6 @@ void cuda_circuit_bootstrap_64(
uint32_t level_bsk, uint32_t base_log_bsk, uint32_t level_pksk,
uint32_t base_log_pksk, uint32_t level_cbs, uint32_t base_log_cbs,
uint32_t number_of_samples, uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||

View File

@@ -12,6 +12,7 @@
* arbitrary number of levels by using decompose_and_compress_level().
*
* This class always decomposes the entire set of num_poly polynomials.
* By default, it works on a single polynomial.
*/
#pragma once
template <typename T, class params> class GadgetMatrix {
@@ -28,7 +29,7 @@ private:
public:
__device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
uint32_t num_poly)
uint32_t num_poly = 1)
: base_log(base_log), level_count(level_count), num_poly(num_poly),
state(state) {
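With the defaulted argument, single-polynomial call sites such as the GadgetMatrix construction in the kernel above can drop the trailing 1, while callers that decompose several concatenated polynomials keep passing num_poly explicitly.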

View File

@@ -61,11 +61,13 @@ __device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
* Performs acc = input / X^j (negacyclic division) if zeroAcc = false
* Performs acc = 0 if zeroAcc
* takes single buffer and calculates inplace.
*
* By default, it works on a single polynomial.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
uint32_t j, bool zeroAcc,
uint32_t num_poly) {
uint32_t num_poly = 1) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
@@ -105,10 +107,12 @@ __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
*
* Performs result_acc = acc * (X^j - 1), i.e. acc * X^j - acc
* takes single buffer as input and returns a single rotated buffer
*
* By default, it works on a single polynomial.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
T *acc, T *result_acc, uint32_t j, uint32_t num_poly) {
T *acc, T *result_acc, uint32_t j, uint32_t num_poly = 1) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
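Both monomial helpers rely on the negacyclic relation X^N = -1 in Z[X]/(X^N + 1): moving a coefficient past position N - 1 wraps it around with a sign flip. A toy host-side multiplication by X^j under that convention (illustrative only, not the optimized device code):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Multiply a polynomial by X^j modulo X^N + 1: every wrap past degree N - 1
// flips the sign of the wrapped coefficient, since X^N = -1.
std::vector<int64_t> multiply_by_monomial(const std::vector<int64_t> &poly,
                                          uint32_t j) {
  const uint32_t N = (uint32_t)poly.size();
  std::vector<int64_t> out(N);
  for (uint32_t i = 0; i < N; i++) {
    uint32_t dst = (i + j) % N;
    bool flipped = ((i + j) / N) % 2 == 1; // odd number of wraps => negate
    out[dst] = flipped ? -poly[i] : poly[i];
  }
  return out;
}

int main() {
  std::vector<int64_t> p = {1, 2, 3, 4}; // 1 + 2X + 3X^2 + 4X^3, N = 4
  auto q = multiply_by_monomial(p, 2);   // expect -3 - 4X + X^2 + 2X^3
  for (int64_t c : q)
    printf("%lld ", (long long)c);
  printf("\n");
  return 0;
}
```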
@@ -138,11 +142,13 @@ __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
/*
* Receives num_poly concatenated polynomials of type T. For each performs a
* rounding to increase accuracy of the PBS. Calculates inplace.
*
* By default, it works on a single polynomial.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
int level_count,
uint32_t num_poly) {
uint32_t num_poly = 1) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
@@ -192,20 +198,21 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
}
}
// Extracts the body of a GLWE with dimension glwe_dimension
// Extracts the body of a GLWE.
// k is the offset to find the body element / polynomial in the lwe_array_out /
// accumulator
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
uint32_t glwe_dimension) {
uint32_t k) {
// Set first coefficient of the accumulator as the body of the LWE sample
lwe_array_out[glwe_dimension * params::degree] =
accumulator[glwe_dimension * params::degree];
lwe_array_out[k * params::degree] = accumulator[k * params::degree];
}
// Extracts the mask of a GLWE with dimension glwe_dimension
// Extracts the mask from num_poly polynomials individually
template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
uint32_t glwe_dimension) {
for (int z = 0; z < glwe_dimension; z++) {
uint32_t num_poly = 1) {
for (int z = 0; z < num_poly; z++) {
Torus *lwe_array_out_slice =
(Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
Torus *accumulator_slice =

View File

@@ -17,8 +17,6 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
"be >= 1 ",
r >= 1));
@@ -99,8 +97,6 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
"be >= 1 ",
r >= 1));

View File

@@ -35,8 +35,6 @@ void cuda_circuit_bootstrap_vertical_packing_64(
uint32_t base_log_pksk, uint32_t level_count_cbs, uint32_t base_log_cbs,
uint32_t number_of_inputs, uint32_t lut_number,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
@@ -149,8 +147,6 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
uint32_t number_of_bits_of_message_including_padding,
uint32_t number_of_bits_to_extract,
uint32_t number_of_inputs, uint32_t max_shared_memory) {
assert(("Error (GPU WOP PBS): glwe_dimension should be equal to 1",
glwe_dimension == 1));
assert(("Error (GPU WOP PBS): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||

View File

@@ -174,8 +174,8 @@ __host__ void host_wop_pbs(
lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_out_ks_buffer,
lwe_array_out_pbs_buffer, lut_pbs, lut_vector_indexes, ksk, fourier_bsk,
number_of_bits_to_extract, delta_log, polynomial_size, lwe_dimension,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
number_of_inputs, max_shared_memory);
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, number_of_inputs, max_shared_memory);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lut_pbs, stream, gpu_index);
cuda_drop_async(lut_vector_indexes, stream, gpu_index);