mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-10 07:08:03 -05:00
fix(gpu): general fixes and improvements to PBS
- update pbs test parameters to match tfhe-rs' integer tests - refactor mul_ggsw_glwe to make it easier to read - fix the way we accumulate the external product result on multi-bit PBS
This commit is contained in:
@@ -106,7 +106,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
uint32_t lwe_chunk_size;
|
||||
double2 *keybundle_fft;
|
||||
Torus *global_accumulator;
|
||||
double2 *global_accumulator_fft;
|
||||
double2 *global_join_buffer;
|
||||
|
||||
PBS_VARIANT pbs_variant;
|
||||
|
||||
@@ -225,10 +225,12 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
|
||||
stream, gpu_index);
|
||||
global_accumulator = (Torus *)cuda_malloc_async(
|
||||
num_blocks_acc_step_one * polynomial_size * sizeof(Torus), stream,
|
||||
gpu_index);
|
||||
global_accumulator_fft = (double2 *)cuda_malloc_async(
|
||||
num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2),
|
||||
input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
|
||||
sizeof(Torus),
|
||||
stream, gpu_index);
|
||||
global_join_buffer = (double2 *)cuda_malloc_async(
|
||||
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
|
||||
(polynomial_size / 2) * sizeof(double2),
|
||||
stream, gpu_index);
|
||||
}
|
||||
}
|
||||
@@ -260,7 +262,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
|
||||
|
||||
cuda_drop_async(keybundle_fft, stream, gpu_index);
|
||||
cuda_drop_async(global_accumulator, stream, gpu_index);
|
||||
cuda_drop_async(global_accumulator_fft, stream, gpu_index);
|
||||
cuda_drop_async(global_join_buffer, stream, gpu_index);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -69,7 +69,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
int8_t *d_mem;
|
||||
|
||||
Torus *global_accumulator;
|
||||
double2 *global_accumulator_fft;
|
||||
double2 *global_join_buffer;
|
||||
|
||||
PBS_VARIANT pbs_variant;
|
||||
|
||||
@@ -114,7 +114,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
// Otherwise, both kernels run all in shared memory
|
||||
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
|
||||
|
||||
global_accumulator_fft = (double2 *)cuda_malloc_async(
|
||||
global_join_buffer = (double2 *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
|
||||
(polynomial_size / 2) * sizeof(double2),
|
||||
stream, gpu_index);
|
||||
@@ -147,7 +147,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
// Otherwise, both kernels run all in shared memory
|
||||
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
|
||||
|
||||
global_accumulator_fft = (double2 *)cuda_malloc_async(
|
||||
global_join_buffer = (double2 *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
|
||||
polynomial_size / 2 * sizeof(double2),
|
||||
stream, gpu_index);
|
||||
@@ -194,7 +194,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
// Otherwise, both kernels run all in shared memory
|
||||
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
|
||||
|
||||
global_accumulator_fft = (double2 *)cuda_malloc_async(
|
||||
global_join_buffer = (double2 *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
|
||||
polynomial_size / 2 * sizeof(double2),
|
||||
stream, gpu_index);
|
||||
@@ -208,7 +208,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
|
||||
|
||||
void release(cudaStream_t stream, uint32_t gpu_index) {
|
||||
cuda_drop_async(d_mem, stream, gpu_index);
|
||||
cuda_drop_async(global_accumulator_fft, stream, gpu_index);
|
||||
cuda_drop_async(global_join_buffer, stream, gpu_index);
|
||||
|
||||
if (pbs_variant == DEFAULT)
|
||||
cuda_drop_async(global_accumulator, stream, gpu_index);
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#ifndef CNCRT_CRYPTO_CUH
|
||||
#define CNCRT_CRPYTO_CUH
|
||||
|
||||
#include "crypto/torus.cuh"
|
||||
#include "device.h"
|
||||
#include <cstdint>
|
||||
|
||||
@@ -21,7 +22,6 @@ private:
|
||||
uint32_t base_log;
|
||||
uint32_t mask;
|
||||
uint32_t num_poly;
|
||||
int current_level;
|
||||
T mask_mod_b;
|
||||
T *state;
|
||||
|
||||
@@ -32,7 +32,6 @@ public:
|
||||
state(state) {
|
||||
|
||||
mask_mod_b = (1ll << base_log) - 1ll;
|
||||
current_level = level_count;
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < num_poly * params::opt; i++) {
|
||||
state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
|
||||
@@ -52,8 +51,6 @@ public:
|
||||
// Decomposes a single polynomial
|
||||
__device__ void decompose_and_compress_next_polynomial(double2 *result,
|
||||
int j) {
|
||||
if (j == 0)
|
||||
current_level -= 1;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
auto state_slice = state + j * params::degree;
|
||||
@@ -72,8 +69,8 @@ public:
|
||||
res_re -= carry_re << base_log;
|
||||
res_im -= carry_im << base_log;
|
||||
|
||||
result[tid].x = (int32_t)res_re;
|
||||
result[tid].y = (int32_t)res_im;
|
||||
typecast_torus_to_double(res_re, result[tid].x);
|
||||
typecast_torus_to_double(res_im, result[tid].y);
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#ifndef CNCRT_TORUS_CUH
|
||||
#define CNCRT_TORUS_CUH
|
||||
|
||||
#include "device.h"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "types/int128.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
@@ -43,6 +44,21 @@ __device__ inline void typecast_double_round_to_torus(double x, T &r) {
|
||||
typecast_double_to_torus(round(frac), r);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ inline void typecast_torus_to_double(T x, double &r);
|
||||
|
||||
template <>
|
||||
__device__ inline void typecast_torus_to_double<uint32_t>(uint32_t x,
|
||||
double &r) {
|
||||
r = __int2double_rn(x);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ inline void typecast_torus_to_double<uint64_t>(uint64_t x,
|
||||
double &r) {
|
||||
r = __ll2double_rn(x);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
|
||||
uint32_t level_count) {
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "helper_multi_gpu.h"
|
||||
#include "pbs/programmable_bootstrap_multibit.h"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
|
||||
using namespace cooperative_groups;
|
||||
namespace cg = cooperative_groups;
|
||||
@@ -20,59 +21,43 @@ get_join_buffer_element(int level_id, int glwe_id, G &group,
|
||||
double2 *global_memory_buffer, uint32_t polynomial_size,
|
||||
uint32_t glwe_dimension, bool support_dsm);
|
||||
|
||||
template <typename Torus, typename G, class params>
|
||||
/** Perform the matrix multiplication between the GGSW and the GLWE,
|
||||
* each block operating on a single level for mask and body.
|
||||
* Both operands should be at fourier domain
|
||||
*
|
||||
* This function assumes:
|
||||
* - Thread blocks at dimension x relates to the decomposition level.
|
||||
* - Thread blocks at dimension y relates to the glwe dimension.
|
||||
* - polynomial_size / params::opt threads are available per block
|
||||
*/
|
||||
template <typename G, class params>
|
||||
__device__ void
|
||||
mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
|
||||
const double2 *__restrict__ bootstrapping_key,
|
||||
int polynomial_size, uint32_t glwe_dimension, int level_count,
|
||||
int iteration, G &group, bool support_dsm = false) {
|
||||
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Get the pieces of the bootstrapping key that will be needed for the
|
||||
// external product; blockIdx.x is the ID of the block that's executing
|
||||
// this function, so we end up getting the lines of the bootstrapping key
|
||||
// needed to perform the external product in this block (corresponding to
|
||||
// the same decomposition level)
|
||||
auto bsk_slice = get_ith_mask_kth_block(
|
||||
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
|
||||
glwe_dimension, level_count);
|
||||
|
||||
// Perform the matrix multiplication between the GGSW and the GLWE,
|
||||
// each block operating on a single level for mask and body
|
||||
mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
|
||||
const double2 *__restrict__ bootstrapping_key,
|
||||
int iteration, G &group,
|
||||
bool support_dsm = false) {
|
||||
const uint32_t polynomial_size = params::degree;
|
||||
const uint32_t glwe_dimension = gridDim.y - 1;
|
||||
const uint32_t level_count = gridDim.x;
|
||||
|
||||
// The first product is used to initialize level_join_buffer
|
||||
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
|
||||
auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
|
||||
auto buffer_slice =
|
||||
get_join_buffer_element<G>(blockIdx.x, blockIdx.y, group, join_buffer,
|
||||
polynomial_size, glwe_dimension, support_dsm);
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
group.sync();
|
||||
|
||||
// Continues multiplying fft by every polynomial in that particular bsk level
|
||||
// Each y-block accumulates in a different polynomial at each iteration
|
||||
for (int j = 1; j < (glwe_dimension + 1); j++) {
|
||||
auto bsk_slice = get_ith_mask_kth_block(
|
||||
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
|
||||
glwe_dimension, level_count);
|
||||
for (int j = 0; j < glwe_dimension + 1; j++) {
|
||||
int idx = (j + this_block_rank) % (glwe_dimension + 1);
|
||||
|
||||
auto bsk_poly = bsk_slice + idx * params::degree / 2;
|
||||
auto bsk_poly = bsk_slice + idx * polynomial_size / 2;
|
||||
auto buffer_slice = get_join_buffer_element<G>(blockIdx.x, idx, group,
|
||||
join_buffer, polynomial_size,
|
||||
glwe_dimension, support_dsm);
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
polynomial_product_accumulate_in_fourier_domain<params, double2>(
|
||||
buffer_slice, fft, bsk_poly, j == 0);
|
||||
group.sync();
|
||||
}
|
||||
|
||||
@@ -80,40 +65,16 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
|
||||
// All blocks are synchronized here; after this sync, level_join_buffer has
|
||||
// the values needed from every other block
|
||||
|
||||
auto src_acc =
|
||||
get_join_buffer_element<G>(0, blockIdx.y, group, join_buffer,
|
||||
polynomial_size, glwe_dimension, support_dsm);
|
||||
|
||||
// copy first product into fft buffer
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] = src_acc[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// accumulate rest of the products into fft buffer
|
||||
for (int l = 1; l < gridDim.x; l++) {
|
||||
for (int l = 0; l < level_count; l++) {
|
||||
auto cur_src_acc = get_join_buffer_element<G>(l, blockIdx.y, group,
|
||||
join_buffer, polynomial_size,
|
||||
glwe_dimension, support_dsm);
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
fft[tid] += cur_src_acc[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
polynomial_accumulate_in_fourier_domain<params>(fft, cur_src_acc, l == 0);
|
||||
}
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
|
||||
// accumulator
|
||||
NSMFFT_inverse<HalfDegree<params>>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(fft, accumulator);
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
|
||||
@@ -129,18 +129,16 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
|
||||
accumulator_rotated);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
// accumulator_rotated, so we need to synchronize here to make sure they
|
||||
// don't modify the same memory space at the same time
|
||||
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform G^-1(ACC) * GGSW -> GLWE
|
||||
mul_ggsw_glwe<Torus, grid_group, params>(
|
||||
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
|
||||
polynomial_size, glwe_dimension, level_count, i, grid);
|
||||
|
||||
mul_ggsw_glwe_in_fourier_domain<grid_group, params>(
|
||||
accumulator_fft, block_join_buffer, bootstrapping_key, i, grid);
|
||||
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(accumulator_fft, accumulator);
|
||||
}
|
||||
|
||||
auto block_lwe_array_out =
|
||||
@@ -148,40 +146,42 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result, but
|
||||
// we do the computation at block 0 to avoid waiting for extra blocks, in
|
||||
// case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
if (blockIdx.x == 0) {
|
||||
if (blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 1, i * lut_stride);
|
||||
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 1, i * lut_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -254,7 +254,7 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
uint64_t partial_dm = full_dm - partial_sm;
|
||||
|
||||
int8_t *d_mem = buffer->d_mem;
|
||||
double2 *buffer_fft = buffer->global_accumulator_fft;
|
||||
double2 *buffer_fft = buffer->global_join_buffer;
|
||||
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
|
||||
|
||||
@@ -33,7 +33,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block,
|
||||
uint32_t lut_count, uint32_t lut_stride) {
|
||||
|
||||
grid_group grid = this_grid();
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
@@ -50,9 +49,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
selected_memory = &device_mem[block_index * device_memory_size_per_block];
|
||||
}
|
||||
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
Torus *accumulator_rotated = (Torus *)selected_memory;
|
||||
double2 *accumulator_fft =
|
||||
(double2 *)accumulator +
|
||||
(double2 *)accumulator_rotated +
|
||||
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
|
||||
|
||||
if constexpr (SMD == PARTIALSM)
|
||||
@@ -71,13 +70,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
|
||||
params::degree / 2];
|
||||
|
||||
Torus *global_slice =
|
||||
global_accumulator +
|
||||
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
|
||||
Torus *global_accumulator_slice =
|
||||
&global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) *
|
||||
params::degree];
|
||||
|
||||
const double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
blockIdx.z * keybundle_size_per_input;
|
||||
const double2 *keybundle =
|
||||
&keybundle_array[blockIdx.z * keybundle_size_per_input];
|
||||
|
||||
if (lwe_offset == 0) {
|
||||
// Put "b" in [0, 2N[
|
||||
@@ -87,12 +85,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
accumulator_rotated, &block_lut_vector[blockIdx.y * params::degree],
|
||||
b_hat, false);
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
// Load the accumulator_rotated calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
global_slice, accumulator);
|
||||
global_accumulator_slice, accumulator_rotated);
|
||||
}
|
||||
|
||||
for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
|
||||
@@ -100,79 +98,82 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
accumulator_rotated, base_log, level_count);
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// Decompose the accumulator_rotated. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
// accumulator_rotated decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
|
||||
accumulator_rotated);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
// accumulator_rotated, so we need to synchronize here to make sure they
|
||||
// don't modify the same memory space at the same time
|
||||
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform G^-1(ACC) * GGSW -> GLWE
|
||||
mul_ggsw_glwe<Torus, grid_group, params>(
|
||||
accumulator, accumulator_fft, block_join_buffer, keybundle,
|
||||
polynomial_size, glwe_dimension, level_count, i, grid);
|
||||
|
||||
mul_ggsw_glwe_in_fourier_domain<grid_group, params>(
|
||||
accumulator_fft, block_join_buffer, keybundle, i, grid);
|
||||
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(accumulator_fft, accumulator_rotated, true);
|
||||
}
|
||||
|
||||
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
auto accumulator = accumulator_rotated;
|
||||
|
||||
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
// Always extract one by default
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
if (blockIdx.x == 0) {
|
||||
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
if (blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra
|
||||
// blocks, in case they're not synchronized Always extract one by
|
||||
// default
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
|
||||
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 1, i * lut_stride);
|
||||
}
|
||||
}
|
||||
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 1, i * lut_stride);
|
||||
}
|
||||
}
|
||||
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, global_accumulator_slice);
|
||||
}
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, global_slice);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -295,15 +296,18 @@ __host__ void execute_cg_external_product_loop(
|
||||
uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count,
|
||||
uint32_t lut_stride) {
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
uint64_t full_dm =
|
||||
uint64_t full_sm =
|
||||
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
uint64_t partial_dm =
|
||||
uint64_t partial_sm =
|
||||
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
|
||||
polynomial_size);
|
||||
|
||||
auto full_dm = full_sm;
|
||||
auto partial_dm = full_sm - partial_sm;
|
||||
uint64_t no_dm = 0;
|
||||
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
@@ -313,13 +317,11 @@ __host__ void execute_cg_external_product_loop(
|
||||
|
||||
uint32_t chunk_size =
|
||||
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
if (chunk_size == 0)
|
||||
return;
|
||||
|
||||
auto d_mem = buffer->d_mem_acc_cg;
|
||||
auto keybundle_fft = buffer->keybundle_fft;
|
||||
auto global_accumulator = buffer->global_accumulator;
|
||||
auto buffer_fft = buffer->global_accumulator_fft;
|
||||
auto join_buffer = buffer->global_join_buffer;
|
||||
|
||||
void *kernel_args[22];
|
||||
kernel_args[0] = &lwe_array_out;
|
||||
@@ -329,7 +331,7 @@ __host__ void execute_cg_external_product_loop(
|
||||
kernel_args[4] = &lwe_array_in;
|
||||
kernel_args[5] = &lwe_input_indexes;
|
||||
kernel_args[6] = &keybundle_fft;
|
||||
kernel_args[7] = &buffer_fft;
|
||||
kernel_args[7] = &join_buffer;
|
||||
kernel_args[8] = &global_accumulator;
|
||||
kernel_args[9] = &lwe_dimension;
|
||||
kernel_args[10] = &glwe_dimension;
|
||||
@@ -358,13 +360,13 @@ __host__ void execute_cg_external_product_loop(
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
|
||||
Torus, params, PARTIALSM>,
|
||||
grid_accumulate, thds, (void **)kernel_args, partial_dm, stream));
|
||||
grid_accumulate, thds, (void **)kernel_args, partial_sm, stream));
|
||||
} else {
|
||||
kernel_args[19] = &no_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
|
||||
Torus, params, FULLSM>,
|
||||
grid_accumulate, thds, (void **)kernel_args, full_dm, stream));
|
||||
grid_accumulate, thds, (void **)kernel_args, full_sm, stream));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -25,7 +25,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double2 *__restrict__ bootstrapping_key,
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
Torus *global_accumulator, double2 *global_join_buffer,
|
||||
uint32_t lwe_iteration, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block) {
|
||||
@@ -67,10 +67,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
|
||||
|
||||
double2 *global_fft_slice =
|
||||
global_accumulator_fft +
|
||||
(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
|
||||
blockIdx.z * level_count * (glwe_dimension + 1)) *
|
||||
(polynomial_size / 2);
|
||||
global_join_buffer + (blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
|
||||
blockIdx.z * level_count * (glwe_dimension + 1)) *
|
||||
(polynomial_size / 2);
|
||||
|
||||
if (lwe_iteration == 0) {
|
||||
// First iteration
|
||||
@@ -139,7 +138,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const double2 *__restrict__ bootstrapping_key,
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
Torus *global_accumulator, double2 *global_join_buffer,
|
||||
uint32_t lwe_iteration, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block,
|
||||
@@ -171,9 +170,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
accumulator_fft = (double2 *)sharedmem;
|
||||
|
||||
for (int level = 0; level < level_count; level++) {
|
||||
double2 *global_fft_slice = global_accumulator_fft +
|
||||
(level + blockIdx.x * level_count) *
|
||||
(glwe_dimension + 1) * (params::degree / 2);
|
||||
double2 *global_fft_slice =
|
||||
global_join_buffer + (level + blockIdx.x * level_count) *
|
||||
(glwe_dimension + 1) * (params::degree / 2);
|
||||
|
||||
for (int j = 0; j < (glwe_dimension + 1); j++) {
|
||||
double2 *fft = global_fft_slice + j * params::degree / 2;
|
||||
@@ -292,7 +291,7 @@ uint64_t get_buffer_size_programmable_bootstrap(
|
||||
}
|
||||
// Otherwise, both kernels run all in shared memory
|
||||
uint64_t buffer_size = device_mem +
|
||||
// global_accumulator_fft
|
||||
// global_join_buffer
|
||||
(glwe_dimension + 1) * level_count *
|
||||
input_lwe_ciphertext_count *
|
||||
(polynomial_size / 2) * sizeof(double2) +
|
||||
@@ -368,7 +367,7 @@ __host__ void execute_step_one(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
Torus *global_accumulator, double2 *global_join_buffer,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
|
||||
@@ -383,21 +382,21 @@ __host__ void execute_step_one(
|
||||
device_programmable_bootstrap_step_one<Torus, params, NOSM>
|
||||
<<<grid, thds, 0, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
bootstrapping_key, global_accumulator, global_join_buffer,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, full_dm);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>
|
||||
<<<grid, thds, partial_sm, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
bootstrapping_key, global_accumulator, global_join_buffer,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, partial_dm);
|
||||
} else {
|
||||
device_programmable_bootstrap_step_one<Torus, params, FULLSM>
|
||||
<<<grid, thds, full_sm, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
bootstrapping_key, global_accumulator, global_join_buffer,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, 0);
|
||||
}
|
||||
@@ -409,7 +408,7 @@ __host__ void execute_step_two(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, double2 const *bootstrapping_key,
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
Torus *global_accumulator, double2 *global_join_buffer,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
|
||||
@@ -425,21 +424,21 @@ __host__ void execute_step_two(
|
||||
device_programmable_bootstrap_step_two<Torus, params, NOSM>
|
||||
<<<grid, thds, 0, stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
bootstrapping_key, global_accumulator, global_join_buffer,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, full_dm, lut_count, lut_stride);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>
|
||||
<<<grid, thds, partial_sm, stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
bootstrapping_key, global_accumulator, global_join_buffer,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, partial_dm, lut_count, lut_stride);
|
||||
} else {
|
||||
device_programmable_bootstrap_step_two<Torus, params, FULLSM>
|
||||
<<<grid, thds, full_sm, stream>>>(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
bootstrapping_key, global_accumulator, global_join_buffer,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, 0, lut_count, lut_stride);
|
||||
}
|
||||
@@ -478,20 +477,20 @@ __host__ void host_programmable_bootstrap(
|
||||
uint64_t full_dm_step_two = full_sm_step_two;
|
||||
|
||||
Torus *global_accumulator = pbs_buffer->global_accumulator;
|
||||
double2 *global_accumulator_fft = pbs_buffer->global_accumulator_fft;
|
||||
double2 *global_join_buffer = pbs_buffer->global_join_buffer;
|
||||
int8_t *d_mem = pbs_buffer->d_mem;
|
||||
|
||||
for (int i = 0; i < lwe_dimension; i++) {
|
||||
execute_step_one<Torus, params>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, global_accumulator,
|
||||
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
|
||||
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
|
||||
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
|
||||
execute_step_two<Torus, params>(
|
||||
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, bootstrapping_key, global_accumulator,
|
||||
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
|
||||
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
|
||||
partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
|
||||
lut_count, lut_stride);
|
||||
|
||||
@@ -50,7 +50,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
uint64_t device_memory_size_per_block) {
|
||||
|
||||
extern __shared__ int8_t sharedmem[];
|
||||
int8_t *selected_memory = sharedmem;
|
||||
int8_t *selected_memory;
|
||||
|
||||
if constexpr (SMD == FULLSM) {
|
||||
selected_memory = sharedmem;
|
||||
@@ -190,14 +190,14 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
(glwe_dimension + 1)];
|
||||
|
||||
Torus *global_slice =
|
||||
global_accumulator +
|
||||
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
|
||||
&global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) *
|
||||
params::degree];
|
||||
|
||||
double2 *global_fft_slice =
|
||||
global_accumulator_fft +
|
||||
(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
|
||||
blockIdx.z * level_count * (glwe_dimension + 1)) *
|
||||
(polynomial_size / 2);
|
||||
&global_accumulator_fft[(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
|
||||
blockIdx.z * level_count *
|
||||
(glwe_dimension + 1)) *
|
||||
(polynomial_size / 2)];
|
||||
|
||||
if (lwe_iteration == 0) {
|
||||
// First iteration
|
||||
@@ -249,8 +249,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_two(
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
|
||||
double2 *global_accumulator_fft, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
double2 *join_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
|
||||
uint32_t lwe_chunk_size, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, uint32_t lut_count,
|
||||
@@ -274,30 +274,29 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
double2 *accumulator_fft = (double2 *)selected_memory;
|
||||
|
||||
//
|
||||
const double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
blockIdx.x * lwe_chunk_size * level_count *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) *
|
||||
(polynomial_size / 2);
|
||||
const double2 *keybundle =
|
||||
&keybundle_array[blockIdx.x * lwe_chunk_size * level_count *
|
||||
(glwe_dimension + 1) * (glwe_dimension + 1) *
|
||||
(polynomial_size / 2)];
|
||||
|
||||
double2 *global_accumulator_fft_input =
|
||||
global_accumulator_fft +
|
||||
blockIdx.x * level_count * (glwe_dimension + 1) * (polynomial_size / 2);
|
||||
double2 *join_buffer_slice =
|
||||
&join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
|
||||
(polynomial_size / 2)];
|
||||
|
||||
for (int level = 0; level < level_count; level++) {
|
||||
double2 *global_fft_slice =
|
||||
global_accumulator_fft_input +
|
||||
level * (glwe_dimension + 1) * (polynomial_size / 2);
|
||||
&join_buffer_slice[level * (glwe_dimension + 1) *
|
||||
(polynomial_size / 2)];
|
||||
|
||||
for (int j = 0; j < (glwe_dimension + 1); j++) {
|
||||
double2 *fft = global_fft_slice + j * params::degree / 2;
|
||||
double2 *fft = &global_fft_slice[j * params::degree / 2];
|
||||
|
||||
// Get the bootstrapping key piece necessary for the multiplication
|
||||
// It is already in the Fourier domain
|
||||
auto bsk_slice =
|
||||
get_ith_mask_kth_block(keybundle, iteration, j, level,
|
||||
polynomial_size, glwe_dimension, level_count);
|
||||
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
|
||||
auto bsk_poly = &bsk_slice[blockIdx.y * params::degree / 2];
|
||||
|
||||
polynomial_product_accumulate_in_fourier_domain<params, double2>(
|
||||
accumulator_fft, fft, bsk_poly, !level && !j);
|
||||
@@ -308,8 +307,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// accumulator
|
||||
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
|
||||
Torus *global_slice =
|
||||
global_accumulator +
|
||||
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
|
||||
&global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) *
|
||||
params::degree];
|
||||
|
||||
add_to_torus<Torus, params>(accumulator_fft, global_slice, true);
|
||||
synchronize_threads_in_block();
|
||||
@@ -499,8 +498,6 @@ __host__ void execute_compute_keybundle(
|
||||
auto lwe_chunk_size = buffer->lwe_chunk_size;
|
||||
uint32_t chunk_size =
|
||||
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
if (chunk_size == 0)
|
||||
return;
|
||||
|
||||
uint32_t keybundle_size_per_input =
|
||||
lwe_chunk_size * level_count * (glwe_dimension + 1) *
|
||||
@@ -559,7 +556,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index,
|
||||
//
|
||||
auto d_mem = buffer->d_mem_acc_step_one;
|
||||
auto global_accumulator = buffer->global_accumulator;
|
||||
auto global_accumulator_fft = buffer->global_accumulator_fft;
|
||||
auto global_accumulator_fft = buffer->global_join_buffer;
|
||||
|
||||
dim3 grid_accumulate_step_one(level_count, glwe_dimension + 1, num_samples);
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
@@ -611,7 +608,7 @@ __host__ void execute_step_two(
|
||||
auto d_mem = buffer->d_mem_acc_step_two;
|
||||
auto keybundle_fft = buffer->keybundle_fft;
|
||||
auto global_accumulator = buffer->global_accumulator;
|
||||
auto global_accumulator_fft = buffer->global_accumulator_fft;
|
||||
auto global_accumulator_fft = buffer->global_join_buffer;
|
||||
|
||||
dim3 grid_accumulate_step_two(num_samples, glwe_dimension + 1);
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
|
||||
@@ -133,18 +133,17 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
|
||||
accumulator_rotated);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
// accumulator_rotated, so we need to synchronize here to make sure they
|
||||
// don't modify the same memory space at the same time
|
||||
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform G^-1(ACC) * GGSW -> GLWE
|
||||
mul_ggsw_glwe<Torus, cluster_group, params>(
|
||||
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
|
||||
polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
|
||||
|
||||
mul_ggsw_glwe_in_fourier_domain<cluster_group, params>(
|
||||
accumulator_fft, block_join_buffer, bootstrapping_key, i, cluster,
|
||||
support_dsm);
|
||||
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(accumulator_fft, accumulator);
|
||||
}
|
||||
|
||||
auto block_lwe_array_out =
|
||||
@@ -152,42 +151,44 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result, but
|
||||
// we do the computation at block 0 to avoid waiting for extra blocks, in
|
||||
// case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
if (blockIdx.x == 0) {
|
||||
if (blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 1, i * lut_stride);
|
||||
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 1, i * lut_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -287,7 +288,7 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
uint64_t partial_dm = full_dm - partial_sm;
|
||||
|
||||
int8_t *d_mem = buffer->d_mem;
|
||||
double2 *buffer_fft = buffer->global_accumulator_fft;
|
||||
double2 *buffer_fft = buffer->global_join_buffer;
|
||||
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
|
||||
|
||||
@@ -54,9 +54,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
selected_memory = &device_mem[block_index * device_memory_size_per_block];
|
||||
}
|
||||
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
Torus *accumulator_rotated = (Torus *)selected_memory;
|
||||
double2 *accumulator_fft =
|
||||
(double2 *)accumulator +
|
||||
(double2 *)accumulator_rotated +
|
||||
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
|
||||
|
||||
if constexpr (SMD == PARTIALSM) {
|
||||
@@ -78,13 +78,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
|
||||
params::degree / 2];
|
||||
|
||||
Torus *global_slice =
|
||||
global_accumulator +
|
||||
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
|
||||
Torus *global_accumulator_slice =
|
||||
&global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) *
|
||||
params::degree];
|
||||
|
||||
const double2 *keybundle = keybundle_array +
|
||||
// select the input
|
||||
blockIdx.z * keybundle_size_per_input;
|
||||
const double2 *keybundle =
|
||||
&keybundle_array[blockIdx.z * keybundle_size_per_input];
|
||||
|
||||
if (lwe_offset == 0) {
|
||||
// Put "b" in [0, 2N[
|
||||
@@ -94,12 +93,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
accumulator_rotated, &block_lut_vector[blockIdx.y * params::degree],
|
||||
b_hat, false);
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
global_slice, accumulator);
|
||||
global_accumulator_slice, accumulator_rotated);
|
||||
}
|
||||
|
||||
for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
|
||||
@@ -107,75 +106,78 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// bootstrapped ciphertext
|
||||
round_to_closest_multiple_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, base_log, level_count);
|
||||
accumulator_rotated, base_log, level_count);
|
||||
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
|
||||
accumulator_rotated);
|
||||
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
// accumulator_rotated, so we need to synchronize here to make sure they
|
||||
// don't modify the same memory space at the same time
|
||||
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform G^-1(ACC) * GGSW -> GLWE
|
||||
mul_ggsw_glwe<Torus, cluster_group, params>(
|
||||
accumulator, accumulator_fft, block_join_buffer, keybundle,
|
||||
polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
|
||||
|
||||
mul_ggsw_glwe_in_fourier_domain<cluster_group, params>(
|
||||
accumulator_fft, block_join_buffer, keybundle, i, cluster, support_dsm);
|
||||
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
add_to_torus<Torus, params>(accumulator_fft, accumulator_rotated, true);
|
||||
}
|
||||
|
||||
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
auto accumulator = accumulator_rotated;
|
||||
|
||||
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra blocks,
|
||||
// in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
if (blockIdx.x == 0) {
|
||||
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
|
||||
auto block_lwe_array_out =
|
||||
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
if (blockIdx.y < glwe_dimension) {
|
||||
// Perform a sample extract. At this point, all blocks have the result,
|
||||
// but we do the computation at block 0 to avoid waiting for extra
|
||||
// blocks, in case they're not synchronized
|
||||
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
|
||||
|
||||
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 1, i * lut_stride);
|
||||
}
|
||||
}
|
||||
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 1, i * lut_stride);
|
||||
}
|
||||
}
|
||||
} else if (blockIdx.y == glwe_dimension) {
|
||||
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
|
||||
if (lut_count > 1) {
|
||||
for (int i = 1; i < lut_count; i++) {
|
||||
|
||||
auto next_lwe_array_out =
|
||||
lwe_array_out +
|
||||
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
|
||||
auto next_block_lwe_array_out =
|
||||
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
|
||||
(glwe_dimension * polynomial_size + 1) +
|
||||
blockIdx.y * polynomial_size];
|
||||
|
||||
sample_extract_body<Torus, params>(next_block_lwe_array_out,
|
||||
accumulator, 0, i * lut_stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, global_accumulator_slice);
|
||||
}
|
||||
} else {
|
||||
// Load the accumulator calculated in previous iterations
|
||||
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
|
||||
accumulator, global_slice);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -326,13 +328,11 @@ __host__ void execute_tbc_external_product_loop(
|
||||
|
||||
uint32_t chunk_size =
|
||||
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
|
||||
if (chunk_size == 0)
|
||||
return;
|
||||
|
||||
auto d_mem = buffer->d_mem_acc_tbc;
|
||||
auto keybundle_fft = buffer->keybundle_fft;
|
||||
auto global_accumulator = buffer->global_accumulator;
|
||||
auto buffer_fft = buffer->global_accumulator_fft;
|
||||
auto buffer_fft = buffer->global_join_buffer;
|
||||
|
||||
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
|
||||
@@ -56,8 +56,8 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
|
||||
bool zeroAcc, uint32_t num_poly = 1) {
|
||||
constexpr int degree = block_size * elems_per_thread;
|
||||
for (int z = 0; z < num_poly; z++) {
|
||||
T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
|
||||
const T *input_slice = (T *)input + (ptrdiff_t)(z * degree);
|
||||
T *accumulator_slice = &accumulator[z * degree];
|
||||
const T *input_slice = &input[z * degree];
|
||||
|
||||
int tid = threadIdx.x;
|
||||
if (zeroAcc) {
|
||||
@@ -66,9 +66,8 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
|
||||
tid += block_size;
|
||||
}
|
||||
} else {
|
||||
tid = threadIdx.x;
|
||||
for (int i = 0; i < elems_per_thread; i++) {
|
||||
if (j < degree) {
|
||||
if (j < degree) {
|
||||
for (int i = 0; i < elems_per_thread; i++) {
|
||||
// if (tid < degree - j)
|
||||
// accumulator_slice[tid] = input_slice[tid + j];
|
||||
// else
|
||||
@@ -76,8 +75,11 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
|
||||
int x = tid + j - SEL(degree, 0, tid < degree - j);
|
||||
accumulator_slice[tid] =
|
||||
SEL(-1, 1, tid < degree - j) * input_slice[x];
|
||||
} else {
|
||||
int32_t jj = j - degree;
|
||||
tid += block_size;
|
||||
}
|
||||
} else {
|
||||
int32_t jj = j - degree;
|
||||
for (int i = 0; i < elems_per_thread; i++) {
|
||||
// if (tid < degree - jj)
|
||||
// accumulator_slice[tid] = -input_slice[tid + jj];
|
||||
// else
|
||||
@@ -85,8 +87,8 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
|
||||
int x = tid + jj - SEL(degree, 0, tid < degree - jj);
|
||||
accumulator_slice[tid] =
|
||||
SEL(1, -1, tid < degree - jj) * input_slice[x];
|
||||
tid += block_size;
|
||||
}
|
||||
tid += block_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -160,9 +162,13 @@ __device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* In case of classical PBS, this method should accumulate the result.
|
||||
* In case of multi-bit PBS, it should overwrite.
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
__device__ void add_to_torus(double2 *m_values, Torus *result,
|
||||
bool init_torus = false) {
|
||||
bool overwrite_result = false) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
@@ -175,7 +181,7 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
|
||||
Torus torus_imag = 0;
|
||||
typecast_double_round_to_torus<Torus>(double_imag, torus_imag);
|
||||
|
||||
if (init_torus) {
|
||||
if (overwrite_result) {
|
||||
result[tid] = torus_real;
|
||||
result[tid + params::degree / 2] = torus_imag;
|
||||
} else {
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
|
||||
#include "crypto/torus.cuh"
|
||||
#include "parameters.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
|
||||
template <typename T>
|
||||
__device__ T *get_chunk(T *data, int chunk_num, int chunk_size) {
|
||||
@@ -55,6 +56,27 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
|
||||
}
|
||||
}
|
||||
|
||||
// Computes result += x
|
||||
// If init_accumulator is set, assumes that result was not initialized and does
|
||||
// that with the outcome of first * second
|
||||
template <class params>
|
||||
__device__ void
|
||||
polynomial_accumulate_in_fourier_domain(double2 *result, double2 *x,
|
||||
bool init_accumulator = false) {
|
||||
auto tid = threadIdx.x;
|
||||
if (init_accumulator) {
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
result[tid] = x[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
result[tid] += x[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This method expects to work with polynomial_size / compression_params::opt
|
||||
// threads in the x-block If init_accumulator is set, assumes that result was
|
||||
// not initialized and does that with the outcome of first * second
|
||||
|
||||
@@ -233,147 +233,15 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
|
||||
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
|
||||
// message_modulus, carry_modulus, number_of_inputs, repetitions,
|
||||
// samples
|
||||
// BOOLEAN_DEFAULT_PARAMETERS
|
||||
// PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
|
||||
2, 2, 2, 40},
|
||||
// BOOLEAN_TFHE_LIB_PARAMETERS
|
||||
887, 1, 2048, new_t_uniform(46), new_t_uniform(17), 22, 1, 4, 4,
|
||||
100, 1, 1},
|
||||
// PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
830, 2, 1024,
|
||||
new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2,
|
||||
2, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)),
|
||||
new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2,
|
||||
1, 2, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
|
||||
2, 2, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
656, 2, 512,
|
||||
new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)),
|
||||
new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1,
|
||||
2, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
|
||||
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
742, 2, 1024,
|
||||
new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4,
|
||||
2, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
|
||||
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
|
||||
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
745, 1, 2048,
|
||||
new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8,
|
||||
2, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
807, 1, 4096,
|
||||
new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1,
|
||||
2, 1, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
915, 1, 8192,
|
||||
new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1,
|
||||
2, 1, 2},
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
864, 1, 8192,
|
||||
new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8,
|
||||
2, 1, 2},
|
||||
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
|
||||
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
930, 1, 16384,
|
||||
new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128,
|
||||
1, 2, 1, 1},
|
||||
|
||||
// BOOLEAN_DEFAULT_PARAMETERS
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
|
||||
2, 100, 2, 40},
|
||||
// BOOLEAN_TFHE_LIB_PARAMETERS
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
830, 2, 1024,
|
||||
new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2,
|
||||
100, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)),
|
||||
new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2,
|
||||
1, 100, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
|
||||
2, 100, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
656, 2, 512,
|
||||
new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)),
|
||||
new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1,
|
||||
100, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
|
||||
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
742, 2, 1024,
|
||||
new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4,
|
||||
100, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
|
||||
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
|
||||
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
745, 1, 2048,
|
||||
new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8,
|
||||
100, 2, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
807, 1, 4096,
|
||||
new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1,
|
||||
100, 1, 40},
|
||||
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
915, 1, 8192,
|
||||
new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1,
|
||||
100, 1, 2},
|
||||
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
864, 1, 8192,
|
||||
new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8,
|
||||
100, 1, 2},
|
||||
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
|
||||
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
|
||||
(ClassicalProgrammableBootstrapTestParams){
|
||||
930, 1, 16384,
|
||||
new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)),
|
||||
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128,
|
||||
1, 100, 1, 1});
|
||||
977, 1, 8192, new_gaussian_from_std_dev(3.0144389706858286e-07),
|
||||
new_gaussian_from_std_dev(2.168404344971009e-19), 16, 2, 8, 8, 100,
|
||||
1, 1});
|
||||
std::string printParamName(
|
||||
::testing::TestParamInfo<ClassicalProgrammableBootstrapTestParams> p) {
|
||||
ClassicalProgrammableBootstrapTestParams params = p.param;
|
||||
|
||||
@@ -171,70 +171,44 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
int lwe_dimension;
|
||||
int glwe_dimension;
|
||||
int polynomial_size;
|
||||
DynamicDistribution lwe_noise_distribution;
|
||||
DynamicDistribution glwe_noise_distribution;
|
||||
int pbs_base_log;
|
||||
int pbs_level;
|
||||
int message_modulus;
|
||||
int carry_modulus;
|
||||
int number_of_inputs;
|
||||
int grouping_factor;
|
||||
int repetitions;
|
||||
int samples;
|
||||
*/
|
||||
// Defines for which parameters set the PBS will be tested.
|
||||
// It executes each src for all pairs on phis X qs (Cartesian product)
|
||||
::testing::internal::ParamGenerator<MultiBitProgrammableBootstrapTestParams>
|
||||
multipbs_params_u64 = ::testing::Values(
|
||||
// fast src
|
||||
// PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64
|
||||
(MultiBitProgrammableBootstrapTestParams){
|
||||
16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2,
|
||||
2, 1, 2, 1, 10},
|
||||
882, 1, 2048, new_t_uniform(46), new_t_uniform(17), 14, 2, 8, 8,
|
||||
100, 3, 1, 1},
|
||||
// PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64
|
||||
(MultiBitProgrammableBootstrapTestParams){
|
||||
16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2,
|
||||
2, 128, 2, 1, 10},
|
||||
// 4_bits_multi_bit_group_2
|
||||
978, 1, 8192, new_gaussian_from_std_dev((2.962875621642539e-07)),
|
||||
new_gaussian_from_std_dev((2.168404344971009e-19)), 14, 2, 8, 8,
|
||||
100, 3, 1, 1},
|
||||
// PARAM_GPU_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64
|
||||
(MultiBitProgrammableBootstrapTestParams){
|
||||
818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 22, 1, 2,
|
||||
2, 1, 2, 1, 10},
|
||||
836, 1, 2048, new_gaussian_from_std_dev((3.433444883863949e-06)),
|
||||
new_gaussian_from_std_dev((2.845267479601915e-15)), 22, 2, 4, 4,
|
||||
100, 2, 1, 1},
|
||||
// PARAM_GPU_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64
|
||||
(MultiBitProgrammableBootstrapTestParams){
|
||||
818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-15)),
|
||||
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-24)), 22, 1, 2,
|
||||
2, 128, 2, 1, 10},
|
||||
// 4_bits_multi_bit_group_3
|
||||
(MultiBitProgrammableBootstrapTestParams){
|
||||
888, 1, 2048,
|
||||
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
|
||||
1, 3, 1, 10},
|
||||
(MultiBitProgrammableBootstrapTestParams){
|
||||
888, 1, 16384,
|
||||
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
|
||||
1, 3, 1, 1},
|
||||
|
||||
(MultiBitProgrammableBootstrapTestParams){
|
||||
888, 1, 1024,
|
||||
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
|
||||
128, 3, 1, 10},
|
||||
(MultiBitProgrammableBootstrapTestParams){
|
||||
888, 1, 2048,
|
||||
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
|
||||
128, 3, 1, 10},
|
||||
(MultiBitProgrammableBootstrapTestParams){
|
||||
888, 1, 4096,
|
||||
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
|
||||
128, 3, 1, 10},
|
||||
(MultiBitProgrammableBootstrapTestParams){
|
||||
888, 1, 8192,
|
||||
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
|
||||
128, 3, 1, 1},
|
||||
(MultiBitProgrammableBootstrapTestParams){
|
||||
888, 1, 16384,
|
||||
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
|
||||
128, 3, 1, 1},
|
||||
(MultiBitProgrammableBootstrapTestParams){
|
||||
972, 1, 8192,
|
||||
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
|
||||
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 14, 2, 8, 8,
|
||||
68, 3, 1, 1});
|
||||
978, 1, 8192, new_gaussian_from_std_dev((2.962875621642539e-07)),
|
||||
new_gaussian_from_std_dev((2.168404344971009e-19)), 14, 2, 8, 8,
|
||||
100, 2, 1, 1});
|
||||
|
||||
std::string printParamName(
|
||||
::testing::TestParamInfo<MultiBitProgrammableBootstrapTestParams> p) {
|
||||
|
||||
Reference in New Issue
Block a user