fix(gpu): general fixes and improvements to PBS

- update pbs test parameters to match tfhe-rs' integer tests
- refactor mul_ggsw_glwe to make it easier to read
- fix the way we accumulate the external product result on multi-bit PBS
This commit is contained in:
Pedro Alves
2024-11-04 15:12:28 -03:00
committed by Agnès Leroy
parent eac30027e9
commit b041608d25
15 changed files with 406 additions and 561 deletions

View File

@@ -106,7 +106,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
uint32_t lwe_chunk_size;
double2 *keybundle_fft;
Torus *global_accumulator;
double2 *global_accumulator_fft;
double2 *global_join_buffer;
PBS_VARIANT pbs_variant;
@@ -225,10 +225,12 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
num_blocks_keybundle * (polynomial_size / 2) * sizeof(double2),
stream, gpu_index);
global_accumulator = (Torus *)cuda_malloc_async(
num_blocks_acc_step_one * polynomial_size * sizeof(Torus), stream,
gpu_index);
global_accumulator_fft = (double2 *)cuda_malloc_async(
num_blocks_acc_step_one * (polynomial_size / 2) * sizeof(double2),
input_lwe_ciphertext_count * (glwe_dimension + 1) * polynomial_size *
sizeof(Torus),
stream, gpu_index);
global_join_buffer = (double2 *)cuda_malloc_async(
level_count * (glwe_dimension + 1) * input_lwe_ciphertext_count *
(polynomial_size / 2) * sizeof(double2),
stream, gpu_index);
}
}
@@ -260,7 +262,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
cuda_drop_async(keybundle_fft, stream, gpu_index);
cuda_drop_async(global_accumulator, stream, gpu_index);
cuda_drop_async(global_accumulator_fft, stream, gpu_index);
cuda_drop_async(global_join_buffer, stream, gpu_index);
}
};

View File

@@ -69,7 +69,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
int8_t *d_mem;
Torus *global_accumulator;
double2 *global_accumulator_fft;
double2 *global_join_buffer;
PBS_VARIANT pbs_variant;
@@ -114,7 +114,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_accumulator_fft = (double2 *)cuda_malloc_async(
global_join_buffer = (double2 *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
(polynomial_size / 2) * sizeof(double2),
stream, gpu_index);
@@ -147,7 +147,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_accumulator_fft = (double2 *)cuda_malloc_async(
global_join_buffer = (double2 *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2),
stream, gpu_index);
@@ -194,7 +194,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
// Otherwise, both kernels run all in shared memory
d_mem = (int8_t *)cuda_malloc_async(device_mem, stream, gpu_index);
global_accumulator_fft = (double2 *)cuda_malloc_async(
global_join_buffer = (double2 *)cuda_malloc_async(
(glwe_dimension + 1) * level_count * input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2),
stream, gpu_index);
@@ -208,7 +208,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
void release(cudaStream_t stream, uint32_t gpu_index) {
cuda_drop_async(d_mem, stream, gpu_index);
cuda_drop_async(global_accumulator_fft, stream, gpu_index);
cuda_drop_async(global_join_buffer, stream, gpu_index);
if (pbs_variant == DEFAULT)
cuda_drop_async(global_accumulator, stream, gpu_index);

View File

@@ -1,6 +1,7 @@
#ifndef CNCRT_CRYPTO_CUH
#define CNCRT_CRPYTO_CUH
#include "crypto/torus.cuh"
#include "device.h"
#include <cstdint>
@@ -21,7 +22,6 @@ private:
uint32_t base_log;
uint32_t mask;
uint32_t num_poly;
int current_level;
T mask_mod_b;
T *state;
@@ -32,7 +32,6 @@ public:
state(state) {
mask_mod_b = (1ll << base_log) - 1ll;
current_level = level_count;
int tid = threadIdx.x;
for (int i = 0; i < num_poly * params::opt; i++) {
state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
@@ -52,8 +51,6 @@ public:
// Decomposes a single polynomial
__device__ void decompose_and_compress_next_polynomial(double2 *result,
int j) {
if (j == 0)
current_level -= 1;
int tid = threadIdx.x;
auto state_slice = state + j * params::degree;
@@ -72,8 +69,8 @@ public:
res_re -= carry_re << base_log;
res_im -= carry_im << base_log;
result[tid].x = (int32_t)res_re;
result[tid].y = (int32_t)res_im;
typecast_torus_to_double(res_re, result[tid].x);
typecast_torus_to_double(res_im, result[tid].y);
tid += params::degree / params::opt;
}

View File

@@ -1,6 +1,7 @@
#ifndef CNCRT_TORUS_CUH
#define CNCRT_TORUS_CUH
#include "device.h"
#include "polynomial/parameters.cuh"
#include "types/int128.cuh"
#include "utils/kernel_dimensions.cuh"
@@ -43,6 +44,21 @@ __device__ inline void typecast_double_round_to_torus(double x, T &r) {
typecast_double_to_torus(round(frac), r);
}
template <typename T>
__device__ inline void typecast_torus_to_double(T x, double &r);
template <>
__device__ inline void typecast_torus_to_double<uint32_t>(uint32_t x,
double &r) {
r = __int2double_rn(x);
}
template <>
__device__ inline void typecast_torus_to_double<uint64_t>(uint64_t x,
double &r) {
r = __ll2double_rn(x);
}
template <typename T>
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
uint32_t level_count) {

View File

@@ -7,6 +7,7 @@
#include "fft/bnsmfft.cuh"
#include "helper_multi_gpu.h"
#include "pbs/programmable_bootstrap_multibit.h"
#include "polynomial/polynomial_math.cuh"
using namespace cooperative_groups;
namespace cg = cooperative_groups;
@@ -20,59 +21,43 @@ get_join_buffer_element(int level_id, int glwe_id, G &group,
double2 *global_memory_buffer, uint32_t polynomial_size,
uint32_t glwe_dimension, bool support_dsm);
template <typename Torus, typename G, class params>
/** Perform the matrix multiplication between the GGSW and the GLWE,
* each block operating on a single level for mask and body.
* Both operands should be at fourier domain
*
* This function assumes:
* - Thread blocks at dimension x relates to the decomposition level.
* - Thread blocks at dimension y relates to the glwe dimension.
* - polynomial_size / params::opt threads are available per block
*/
template <typename G, class params>
__device__ void
mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
const double2 *__restrict__ bootstrapping_key,
int polynomial_size, uint32_t glwe_dimension, int level_count,
int iteration, G &group, bool support_dsm = false) {
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(fft);
synchronize_threads_in_block();
// Get the pieces of the bootstrapping key that will be needed for the
// external product; blockIdx.x is the ID of the block that's executing
// this function, so we end up getting the lines of the bootstrapping key
// needed to perform the external product in this block (corresponding to
// the same decomposition level)
auto bsk_slice = get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
glwe_dimension, level_count);
// Perform the matrix multiplication between the GGSW and the GLWE,
// each block operating on a single level for mask and body
mul_ggsw_glwe_in_fourier_domain(double2 *fft, double2 *join_buffer,
const double2 *__restrict__ bootstrapping_key,
int iteration, G &group,
bool support_dsm = false) {
const uint32_t polynomial_size = params::degree;
const uint32_t glwe_dimension = gridDim.y - 1;
const uint32_t level_count = gridDim.x;
// The first product is used to initialize level_join_buffer
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
auto this_block_rank = get_this_block_rank<G>(group, support_dsm);
auto buffer_slice =
get_join_buffer_element<G>(blockIdx.x, blockIdx.y, group, join_buffer,
polynomial_size, glwe_dimension, support_dsm);
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
group.sync();
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
for (int j = 1; j < (glwe_dimension + 1); j++) {
auto bsk_slice = get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
glwe_dimension, level_count);
for (int j = 0; j < glwe_dimension + 1; j++) {
int idx = (j + this_block_rank) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * params::degree / 2;
auto bsk_poly = bsk_slice + idx * polynomial_size / 2;
auto buffer_slice = get_join_buffer_element<G>(blockIdx.x, idx, group,
join_buffer, polynomial_size,
glwe_dimension, support_dsm);
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
polynomial_product_accumulate_in_fourier_domain<params, double2>(
buffer_slice, fft, bsk_poly, j == 0);
group.sync();
}
@@ -80,40 +65,16 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer,
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
auto src_acc =
get_join_buffer_element<G>(0, blockIdx.y, group, join_buffer,
polynomial_size, glwe_dimension, support_dsm);
// copy first product into fft buffer
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = src_acc[tid];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// accumulate rest of the products into fft buffer
for (int l = 1; l < gridDim.x; l++) {
for (int l = 0; l < level_count; l++) {
auto cur_src_acc = get_join_buffer_element<G>(l, blockIdx.y, group,
join_buffer, polynomial_size,
glwe_dimension, support_dsm);
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] += cur_src_acc[tid];
tid += params::degree / params::opt;
}
polynomial_accumulate_in_fourier_domain<params>(fft, cur_src_acc, l == 0);
}
synchronize_threads_in_block();
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
NSMFFT_inverse<HalfDegree<params>>(fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(fft, accumulator);
__syncthreads();
}
template <typename Torus>

View File

@@ -129,18 +129,16 @@ __global__ void device_programmable_bootstrap_cg(
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, grid_group, params>(
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
polynomial_size, glwe_dimension, level_count, i, grid);
mul_ggsw_glwe_in_fourier_domain<grid_group, params>(
accumulator_fft, block_join_buffer, bootstrapping_key, i, grid);
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(accumulator_fft, accumulator);
}
auto block_lwe_array_out =
@@ -148,40 +146,42 @@ __global__ void device_programmable_bootstrap_cg(
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.x == 0) {
if (blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
accumulator, 1, i * lut_stride);
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
accumulator, 1, i * lut_stride);
}
}
}
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
}
}
}
@@ -254,7 +254,7 @@ __host__ void host_programmable_bootstrap_cg(
uint64_t partial_dm = full_dm - partial_sm;
int8_t *d_mem = buffer->d_mem;
double2 *buffer_fft = buffer->global_accumulator_fft;
double2 *buffer_fft = buffer->global_join_buffer;
int thds = polynomial_size / params::opt;
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);

View File

@@ -33,7 +33,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
int8_t *device_mem, uint64_t device_memory_size_per_block,
uint32_t lut_count, uint32_t lut_stride) {
grid_group grid = this_grid();
// We use shared memory for the polynomials that are used often during the
@@ -50,9 +49,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
Torus *accumulator = (Torus *)selected_memory;
Torus *accumulator_rotated = (Torus *)selected_memory;
double2 *accumulator_fft =
(double2 *)accumulator +
(double2 *)accumulator_rotated +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
if constexpr (SMD == PARTIALSM)
@@ -71,13 +70,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
params::degree / 2];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
Torus *global_accumulator_slice =
&global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) *
params::degree];
const double2 *keybundle = keybundle_array +
// select the input
blockIdx.z * keybundle_size_per_input;
const double2 *keybundle =
&keybundle_array[blockIdx.z * keybundle_size_per_input];
if (lwe_offset == 0) {
// Put "b" in [0, 2N[
@@ -87,12 +85,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
accumulator_rotated, &block_lut_vector[blockIdx.y * params::degree],
b_hat, false);
} else {
// Load the accumulator calculated in previous iterations
// Load the accumulator_rotated calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
global_slice, accumulator);
global_accumulator_slice, accumulator_rotated);
}
for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
@@ -100,79 +98,82 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
accumulator_rotated, base_log, level_count);
// Decompose the accumulator. Each block gets one level of the
// Decompose the accumulator_rotated. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
// accumulator_rotated decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, grid_group, params>(
accumulator, accumulator_fft, block_join_buffer, keybundle,
polynomial_size, glwe_dimension, level_count, i, grid);
mul_ggsw_glwe_in_fourier_domain<grid_group, params>(
accumulator_fft, block_join_buffer, keybundle, i, grid);
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(accumulator_fft, accumulator_rotated, true);
}
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
auto accumulator = accumulator_rotated;
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
// Always extract one by default
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (blockIdx.x == 0) {
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra
// blocks, in case they're not synchronized Always extract one by
// default
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
accumulator, 1, i * lut_stride);
}
}
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
accumulator, 1, i * lut_stride);
}
}
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
}
}
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
accumulator, global_accumulator_slice);
}
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
accumulator, global_slice);
}
}
@@ -295,15 +296,18 @@ __host__ void execute_cg_external_product_loop(
uint32_t level_count, uint32_t lwe_offset, uint32_t lut_count,
uint32_t lut_stride) {
auto lwe_chunk_size = buffer->lwe_chunk_size;
uint64_t full_dm =
uint64_t full_sm =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
polynomial_size);
uint64_t partial_dm =
uint64_t partial_sm =
get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
polynomial_size);
auto full_dm = full_sm;
auto partial_dm = full_sm - partial_sm;
uint64_t no_dm = 0;
auto lwe_chunk_size = buffer->lwe_chunk_size;
int max_shared_memory = cuda_get_max_shared_memory(0);
cudaSetDevice(gpu_index);
@@ -313,13 +317,11 @@ __host__ void execute_cg_external_product_loop(
uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
if (chunk_size == 0)
return;
auto d_mem = buffer->d_mem_acc_cg;
auto keybundle_fft = buffer->keybundle_fft;
auto global_accumulator = buffer->global_accumulator;
auto buffer_fft = buffer->global_accumulator_fft;
auto join_buffer = buffer->global_join_buffer;
void *kernel_args[22];
kernel_args[0] = &lwe_array_out;
@@ -329,7 +331,7 @@ __host__ void execute_cg_external_product_loop(
kernel_args[4] = &lwe_array_in;
kernel_args[5] = &lwe_input_indexes;
kernel_args[6] = &keybundle_fft;
kernel_args[7] = &buffer_fft;
kernel_args[7] = &join_buffer;
kernel_args[8] = &global_accumulator;
kernel_args[9] = &lwe_dimension;
kernel_args[10] = &glwe_dimension;
@@ -358,13 +360,13 @@ __host__ void execute_cg_external_product_loop(
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, PARTIALSM>,
grid_accumulate, thds, (void **)kernel_args, partial_dm, stream));
grid_accumulate, thds, (void **)kernel_args, partial_sm, stream));
} else {
kernel_args[19] = &no_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, FULLSM>,
grid_accumulate, thds, (void **)kernel_args, full_dm, stream));
grid_accumulate, thds, (void **)kernel_args, full_sm, stream));
}
}

View File

@@ -25,7 +25,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
Torus *global_accumulator, double2 *global_join_buffer,
uint32_t lwe_iteration, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
int8_t *device_mem, uint64_t device_memory_size_per_block) {
@@ -67,10 +67,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
double2 *global_fft_slice =
global_accumulator_fft +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
blockIdx.z * level_count * (glwe_dimension + 1)) *
(polynomial_size / 2);
global_join_buffer + (blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
blockIdx.z * level_count * (glwe_dimension + 1)) *
(polynomial_size / 2);
if (lwe_iteration == 0) {
// First iteration
@@ -139,7 +138,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const double2 *__restrict__ bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
Torus *global_accumulator, double2 *global_join_buffer,
uint32_t lwe_iteration, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
int8_t *device_mem, uint64_t device_memory_size_per_block,
@@ -171,9 +170,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
accumulator_fft = (double2 *)sharedmem;
for (int level = 0; level < level_count; level++) {
double2 *global_fft_slice = global_accumulator_fft +
(level + blockIdx.x * level_count) *
(glwe_dimension + 1) * (params::degree / 2);
double2 *global_fft_slice =
global_join_buffer + (level + blockIdx.x * level_count) *
(glwe_dimension + 1) * (params::degree / 2);
for (int j = 0; j < (glwe_dimension + 1); j++) {
double2 *fft = global_fft_slice + j * params::degree / 2;
@@ -292,7 +291,7 @@ uint64_t get_buffer_size_programmable_bootstrap(
}
// Otherwise, both kernels run all in shared memory
uint64_t buffer_size = device_mem +
// global_accumulator_fft
// global_join_buffer
(glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
(polynomial_size / 2) * sizeof(double2) +
@@ -368,7 +367,7 @@ __host__ void execute_step_one(
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
Torus *global_accumulator, double2 *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
@@ -383,21 +382,21 @@ __host__ void execute_step_one(
device_programmable_bootstrap_step_one<Torus, params, NOSM>
<<<grid, thds, 0, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>
<<<grid, thds, partial_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm);
} else {
device_programmable_bootstrap_step_one<Torus, params, FULLSM>
<<<grid, thds, full_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0);
}
@@ -409,7 +408,7 @@ __host__ void execute_step_two(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, double2 const *bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
Torus *global_accumulator, double2 *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, int lwe_iteration, uint64_t partial_sm,
@@ -425,21 +424,21 @@ __host__ void execute_step_two(
device_programmable_bootstrap_step_two<Torus, params, NOSM>
<<<grid, thds, 0, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm, lut_count, lut_stride);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>
<<<grid, thds, partial_sm, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm, lut_count, lut_stride);
} else {
device_programmable_bootstrap_step_two<Torus, params, FULLSM>
<<<grid, thds, full_sm, stream>>>(
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
bootstrapping_key, global_accumulator, global_accumulator_fft,
bootstrapping_key, global_accumulator, global_join_buffer,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0, lut_count, lut_stride);
}
@@ -478,20 +477,20 @@ __host__ void host_programmable_bootstrap(
uint64_t full_dm_step_two = full_sm_step_two;
Torus *global_accumulator = pbs_buffer->global_accumulator;
double2 *global_accumulator_fft = pbs_buffer->global_accumulator_fft;
double2 *global_join_buffer = pbs_buffer->global_join_buffer;
int8_t *d_mem = pbs_buffer->d_mem;
for (int i = 0; i < lwe_dimension; i++) {
execute_step_one<Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, global_accumulator,
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
execute_step_two<Torus, params>(
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, bootstrapping_key, global_accumulator,
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two,
lut_count, lut_stride);

View File

@@ -50,7 +50,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
uint64_t device_memory_size_per_block) {
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory = sharedmem;
int8_t *selected_memory;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
@@ -190,14 +190,14 @@ __global__ void __launch_bounds__(params::degree / params::opt)
(glwe_dimension + 1)];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
&global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) *
params::degree];
double2 *global_fft_slice =
global_accumulator_fft +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
blockIdx.z * level_count * (glwe_dimension + 1)) *
(polynomial_size / 2);
&global_accumulator_fft[(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
blockIdx.z * level_count *
(glwe_dimension + 1)) *
(polynomial_size / 2)];
if (lwe_iteration == 0) {
// First iteration
@@ -249,8 +249,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_accumulate_step_two(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
double2 *global_accumulator_fft, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
double2 *join_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
uint32_t lwe_chunk_size, int8_t *device_mem,
uint64_t device_memory_size_per_block, uint32_t lut_count,
@@ -274,30 +274,29 @@ __global__ void __launch_bounds__(params::degree / params::opt)
double2 *accumulator_fft = (double2 *)selected_memory;
//
const double2 *keybundle = keybundle_array +
// select the input
blockIdx.x * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2);
const double2 *keybundle =
&keybundle_array[blockIdx.x * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2)];
double2 *global_accumulator_fft_input =
global_accumulator_fft +
blockIdx.x * level_count * (glwe_dimension + 1) * (polynomial_size / 2);
double2 *join_buffer_slice =
&join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
(polynomial_size / 2)];
for (int level = 0; level < level_count; level++) {
double2 *global_fft_slice =
global_accumulator_fft_input +
level * (glwe_dimension + 1) * (polynomial_size / 2);
&join_buffer_slice[level * (glwe_dimension + 1) *
(polynomial_size / 2)];
for (int j = 0; j < (glwe_dimension + 1); j++) {
double2 *fft = global_fft_slice + j * params::degree / 2;
double2 *fft = &global_fft_slice[j * params::degree / 2];
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice =
get_ith_mask_kth_block(keybundle, iteration, j, level,
polynomial_size, glwe_dimension, level_count);
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
auto bsk_poly = &bsk_slice[blockIdx.y * params::degree / 2];
polynomial_product_accumulate_in_fourier_domain<params, double2>(
accumulator_fft, fft, bsk_poly, !level && !j);
@@ -308,8 +307,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// accumulator
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
&global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) *
params::degree];
add_to_torus<Torus, params>(accumulator_fft, global_slice, true);
synchronize_threads_in_block();
@@ -499,8 +498,6 @@ __host__ void execute_compute_keybundle(
auto lwe_chunk_size = buffer->lwe_chunk_size;
uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
if (chunk_size == 0)
return;
uint32_t keybundle_size_per_input =
lwe_chunk_size * level_count * (glwe_dimension + 1) *
@@ -559,7 +556,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index,
//
auto d_mem = buffer->d_mem_acc_step_one;
auto global_accumulator = buffer->global_accumulator;
auto global_accumulator_fft = buffer->global_accumulator_fft;
auto global_accumulator_fft = buffer->global_join_buffer;
dim3 grid_accumulate_step_one(level_count, glwe_dimension + 1, num_samples);
dim3 thds(polynomial_size / params::opt, 1, 1);
@@ -611,7 +608,7 @@ __host__ void execute_step_two(
auto d_mem = buffer->d_mem_acc_step_two;
auto keybundle_fft = buffer->keybundle_fft;
auto global_accumulator = buffer->global_accumulator;
auto global_accumulator_fft = buffer->global_accumulator_fft;
auto global_accumulator_fft = buffer->global_join_buffer;
dim3 grid_accumulate_step_two(num_samples, glwe_dimension + 1);
dim3 thds(polynomial_size / params::opt, 1, 1);

View File

@@ -133,18 +133,17 @@ __global__ void device_programmable_bootstrap_tbc(
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, cluster_group, params>(
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
mul_ggsw_glwe_in_fourier_domain<cluster_group, params>(
accumulator_fft, block_join_buffer, bootstrapping_key, i, cluster,
support_dsm);
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(accumulator_fft, accumulator);
}
auto block_lwe_array_out =
@@ -152,42 +151,44 @@ __global__ void device_programmable_bootstrap_tbc(
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (blockIdx.x == 0) {
if (blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
accumulator, 1, i * lut_stride);
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
accumulator, 1, i * lut_stride);
}
}
}
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
}
}
}
@@ -287,7 +288,7 @@ __host__ void host_programmable_bootstrap_tbc(
uint64_t partial_dm = full_dm - partial_sm;
int8_t *d_mem = buffer->d_mem;
double2 *buffer_fft = buffer->global_accumulator_fft;
double2 *buffer_fft = buffer->global_join_buffer;
int thds = polynomial_size / params::opt;
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);

View File

@@ -54,9 +54,9 @@ __global__ void __launch_bounds__(params::degree / params::opt)
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
Torus *accumulator = (Torus *)selected_memory;
Torus *accumulator_rotated = (Torus *)selected_memory;
double2 *accumulator_fft =
(double2 *)accumulator +
(double2 *)accumulator_rotated +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
if constexpr (SMD == PARTIALSM) {
@@ -78,13 +78,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
params::degree / 2];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
Torus *global_accumulator_slice =
&global_accumulator[(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) *
params::degree];
const double2 *keybundle = keybundle_array +
// select the input
blockIdx.z * keybundle_size_per_input;
const double2 *keybundle =
&keybundle_array[blockIdx.z * keybundle_size_per_input];
if (lwe_offset == 0) {
// Put "b" in [0, 2N[
@@ -94,12 +93,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
accumulator_rotated, &block_lut_vector[blockIdx.y * params::degree],
b_hat, false);
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
global_slice, accumulator);
global_accumulator_slice, accumulator_rotated);
}
for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
@@ -107,75 +106,78 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
accumulator_rotated, base_log, level_count);
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, cluster_group, params>(
accumulator, accumulator_fft, block_join_buffer, keybundle,
polynomial_size, glwe_dimension, level_count, i, cluster, support_dsm);
mul_ggsw_glwe_in_fourier_domain<cluster_group, params>(
accumulator_fft, block_join_buffer, keybundle, i, cluster, support_dsm);
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(accumulator_fft, accumulator_rotated, true);
}
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
auto accumulator = accumulator_rotated;
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
if (blockIdx.x == 0) {
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
auto block_lwe_array_out =
&lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra
// blocks, in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
accumulator, 1, i * lut_stride);
}
}
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_mask<Torus, params>(next_block_lwe_array_out,
accumulator, 1, i * lut_stride);
}
}
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
if (lut_count > 1) {
for (int i = 1; i < lut_count; i++) {
auto next_lwe_array_out =
lwe_array_out +
(i * gridDim.z * (glwe_dimension * polynomial_size + 1));
auto next_block_lwe_array_out =
&next_lwe_array_out[lwe_output_indexes[blockIdx.z] *
(glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
sample_extract_body<Torus, params>(next_block_lwe_array_out,
accumulator, 0, i * lut_stride);
}
}
}
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
accumulator, global_accumulator_slice);
}
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
accumulator, global_slice);
}
}
@@ -326,13 +328,11 @@ __host__ void execute_tbc_external_product_loop(
uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
if (chunk_size == 0)
return;
auto d_mem = buffer->d_mem_acc_tbc;
auto keybundle_fft = buffer->keybundle_fft;
auto global_accumulator = buffer->global_accumulator;
auto buffer_fft = buffer->global_accumulator_fft;
auto buffer_fft = buffer->global_join_buffer;
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
dim3 thds(polynomial_size / params::opt, 1, 1);

View File

@@ -56,8 +56,8 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
bool zeroAcc, uint32_t num_poly = 1) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
const T *input_slice = (T *)input + (ptrdiff_t)(z * degree);
T *accumulator_slice = &accumulator[z * degree];
const T *input_slice = &input[z * degree];
int tid = threadIdx.x;
if (zeroAcc) {
@@ -66,9 +66,8 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
tid += block_size;
}
} else {
tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
if (j < degree) {
for (int i = 0; i < elems_per_thread; i++) {
// if (tid < degree - j)
// accumulator_slice[tid] = input_slice[tid + j];
// else
@@ -76,8 +75,11 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
int x = tid + j - SEL(degree, 0, tid < degree - j);
accumulator_slice[tid] =
SEL(-1, 1, tid < degree - j) * input_slice[x];
} else {
int32_t jj = j - degree;
tid += block_size;
}
} else {
int32_t jj = j - degree;
for (int i = 0; i < elems_per_thread; i++) {
// if (tid < degree - jj)
// accumulator_slice[tid] = -input_slice[tid + jj];
// else
@@ -85,8 +87,8 @@ divide_by_monomial_negacyclic_inplace(T *accumulator,
int x = tid + jj - SEL(degree, 0, tid < degree - jj);
accumulator_slice[tid] =
SEL(1, -1, tid < degree - jj) * input_slice[x];
tid += block_size;
}
tid += block_size;
}
}
}
@@ -160,9 +162,13 @@ __device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
}
}
/**
* In case of classical PBS, this method should accumulate the result.
* In case of multi-bit PBS, it should overwrite.
*/
template <typename Torus, class params>
__device__ void add_to_torus(double2 *m_values, Torus *result,
bool init_torus = false) {
bool overwrite_result = false) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
@@ -175,7 +181,7 @@ __device__ void add_to_torus(double2 *m_values, Torus *result,
Torus torus_imag = 0;
typecast_double_round_to_torus<Torus>(double_imag, torus_imag);
if (init_torus) {
if (overwrite_result) {
result[tid] = torus_real;
result[tid + params::degree / 2] = torus_imag;
} else {

View File

@@ -3,6 +3,7 @@
#include "crypto/torus.cuh"
#include "parameters.cuh"
#include "types/complex/operations.cuh"
template <typename T>
__device__ T *get_chunk(T *data, int chunk_num, int chunk_size) {
@@ -55,6 +56,27 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
}
}
// Computes result += x
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <class params>
__device__ void
polynomial_accumulate_in_fourier_domain(double2 *result, double2 *x,
bool init_accumulator = false) {
auto tid = threadIdx.x;
if (init_accumulator) {
for (int i = 0; i < params::opt / 2; i++) {
result[tid] = x[tid];
tid += params::degree / params::opt;
}
} else {
for (int i = 0; i < params::opt / 2; i++) {
result[tid] += x[tid];
tid += params::degree / params::opt;
}
}
}
// This method expects to work with polynomial_size / compression_params::opt
// threads in the x-block If init_accumulator is set, assumes that result was
// not initialized and does that with the outcome of first * second

View File

@@ -233,147 +233,15 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// message_modulus, carry_modulus, number_of_inputs, repetitions,
// samples
// BOOLEAN_DEFAULT_PARAMETERS
// PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64
(ClassicalProgrammableBootstrapTestParams){
777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
2, 2, 2, 40},
// BOOLEAN_TFHE_LIB_PARAMETERS
887, 1, 2048, new_t_uniform(46), new_t_uniform(17), 22, 1, 4, 4,
100, 1, 1},
// PARAM_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64
(ClassicalProgrammableBootstrapTestParams){
830, 2, 1024,
new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2,
2, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
(ClassicalProgrammableBootstrapTestParams){
678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)),
new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2,
1, 2, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
(ClassicalProgrammableBootstrapTestParams){
684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
2, 2, 2, 40},
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
(ClassicalProgrammableBootstrapTestParams){
656, 2, 512,
new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)),
new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1,
2, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
(ClassicalProgrammableBootstrapTestParams){
742, 2, 1024,
new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4,
2, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
(ClassicalProgrammableBootstrapTestParams){
745, 1, 2048,
new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8,
2, 2, 40},
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
(ClassicalProgrammableBootstrapTestParams){
807, 1, 4096,
new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1,
2, 1, 40},
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
(ClassicalProgrammableBootstrapTestParams){
915, 1, 8192,
new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1,
2, 1, 2},
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
(ClassicalProgrammableBootstrapTestParams){
864, 1, 8192,
new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8,
2, 1, 2},
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
(ClassicalProgrammableBootstrapTestParams){
930, 1, 16384,
new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128,
1, 2, 1, 1},
// BOOLEAN_DEFAULT_PARAMETERS
(ClassicalProgrammableBootstrapTestParams){
777, 3, 512, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
2, 100, 2, 40},
// BOOLEAN_TFHE_LIB_PARAMETERS
(ClassicalProgrammableBootstrapTestParams){
830, 2, 1024,
new_gaussian_from_std_dev(sqrt(1.994564705573226e-12)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 2,
100, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
(ClassicalProgrammableBootstrapTestParams){
678, 5, 256, new_gaussian_from_std_dev(sqrt(5.203010004723453e-10)),
new_gaussian_from_std_dev(sqrt(1.3996292326131784e-19)), 15, 1, 2,
1, 100, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
(ClassicalProgrammableBootstrapTestParams){
684, 3, 512, new_gaussian_from_std_dev(sqrt(4.177054989616946e-10)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 18, 1, 2,
2, 100, 2, 40},
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
(ClassicalProgrammableBootstrapTestParams){
656, 2, 512,
new_gaussian_from_std_dev(sqrt(1.1641198952558192e-09)),
new_gaussian_from_std_dev(sqrt(1.6434266310406663e-15)), 8, 2, 4, 1,
100, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
(ClassicalProgrammableBootstrapTestParams){
742, 2, 1024,
new_gaussian_from_std_dev(sqrt(4.998277131225527e-11)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 4,
100, 2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
(ClassicalProgrammableBootstrapTestParams){
745, 1, 2048,
new_gaussian_from_std_dev(sqrt(4.478453795193731e-11)),
new_gaussian_from_std_dev(sqrt(8.645717832544903e-32)), 23, 1, 2, 8,
100, 2, 40},
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
(ClassicalProgrammableBootstrapTestParams){
807, 1, 4096,
new_gaussian_from_std_dev(sqrt(4.629015039118823e-12)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 32, 1,
100, 1, 40},
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
(ClassicalProgrammableBootstrapTestParams){
915, 1, 8192,
new_gaussian_from_std_dev(sqrt(8.883173851180252e-14)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 22, 1, 64, 1,
100, 1, 2},
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
(ClassicalProgrammableBootstrapTestParams){
864, 1, 8192,
new_gaussian_from_std_dev(sqrt(1.5843564961097632e-15)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 8, 8,
100, 1, 2},
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
(ClassicalProgrammableBootstrapTestParams){
930, 1, 16384,
new_gaussian_from_std_dev(sqrt(5.129877458078009e-14)),
new_gaussian_from_std_dev(sqrt(4.70197740328915e-38)), 15, 2, 128,
1, 100, 1, 1});
977, 1, 8192, new_gaussian_from_std_dev(3.0144389706858286e-07),
new_gaussian_from_std_dev(2.168404344971009e-19), 16, 2, 8, 8, 100,
1, 1});
std::string printParamName(
::testing::TestParamInfo<ClassicalProgrammableBootstrapTestParams> p) {
ClassicalProgrammableBootstrapTestParams params = p.param;

View File

@@ -171,70 +171,44 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
}
}
/**
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
DynamicDistribution lwe_noise_distribution;
DynamicDistribution glwe_noise_distribution;
int pbs_base_log;
int pbs_level;
int message_modulus;
int carry_modulus;
int number_of_inputs;
int grouping_factor;
int repetitions;
int samples;
*/
// Defines for which parameters set the PBS will be tested.
// It executes each src for all pairs on phis X qs (Cartesian product)
::testing::internal::ParamGenerator<MultiBitProgrammableBootstrapTestParams>
multipbs_params_u64 = ::testing::Values(
// fast src
// PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64
(MultiBitProgrammableBootstrapTestParams){
16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2,
2, 1, 2, 1, 10},
882, 1, 2048, new_t_uniform(46), new_t_uniform(17), 14, 2, 8, 8,
100, 3, 1, 1},
// PARAM_GPU_MULTI_BIT_GROUP_3_MESSAGE_3_CARRY_3_KS_PBS_GAUSSIAN_2M64
(MultiBitProgrammableBootstrapTestParams){
16, 1, 256, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 23, 1, 2,
2, 128, 2, 1, 10},
// 4_bits_multi_bit_group_2
978, 1, 8192, new_gaussian_from_std_dev((2.962875621642539e-07)),
new_gaussian_from_std_dev((2.168404344971009e-19)), 14, 2, 8, 8,
100, 3, 1, 1},
// PARAM_GPU_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64
(MultiBitProgrammableBootstrapTestParams){
818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-11)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-23)), 22, 1, 2,
2, 1, 2, 1, 10},
836, 1, 2048, new_gaussian_from_std_dev((3.433444883863949e-06)),
new_gaussian_from_std_dev((2.845267479601915e-15)), 22, 2, 4, 4,
100, 2, 1, 1},
// PARAM_GPU_MULTI_BIT_GROUP_2_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64
(MultiBitProgrammableBootstrapTestParams){
818, 1, 2048, new_gaussian_from_std_dev(sqrt(1.3880686109937e-15)),
new_gaussian_from_std_dev(sqrt(1.1919984450689246e-24)), 22, 1, 2,
2, 128, 2, 1, 10},
// 4_bits_multi_bit_group_3
(MultiBitProgrammableBootstrapTestParams){
888, 1, 2048,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
1, 3, 1, 10},
(MultiBitProgrammableBootstrapTestParams){
888, 1, 16384,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
1, 3, 1, 1},
(MultiBitProgrammableBootstrapTestParams){
888, 1, 1024,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
128, 3, 1, 10},
(MultiBitProgrammableBootstrapTestParams){
888, 1, 2048,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
128, 3, 1, 10},
(MultiBitProgrammableBootstrapTestParams){
888, 1, 4096,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
128, 3, 1, 10},
(MultiBitProgrammableBootstrapTestParams){
888, 1, 8192,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
128, 3, 1, 1},
(MultiBitProgrammableBootstrapTestParams){
888, 1, 16384,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 21, 1, 2, 2,
128, 3, 1, 1},
(MultiBitProgrammableBootstrapTestParams){
972, 1, 8192,
new_gaussian_from_std_dev(sqrt(4.9571231961752025e-12)),
new_gaussian_from_std_dev(sqrt(9.9409770026944e-32)), 14, 2, 8, 8,
68, 3, 1, 1});
978, 1, 8192, new_gaussian_from_std_dev((2.962875621642539e-07)),
new_gaussian_from_std_dev((2.168404344971009e-19)), 14, 2, 8, 8,
100, 2, 1, 1});
std::string printParamName(
::testing::TestParamInfo<MultiBitProgrammableBootstrapTestParams> p) {