refactor(gpu): remove lwe chunk size argument

This commit is contained in:
Agnes Leroy
2024-08-01 10:35:43 +02:00
committed by Agnès Leroy
parent 5547d92c79
commit 7fa9f33776
9 changed files with 102 additions and 167 deletions

View File

@@ -19,8 +19,7 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t chunk_size = 0);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -28,7 +27,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lwe_chunk_size = 0);
uint32_t level_count, uint32_t num_samples);
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
@@ -51,8 +50,7 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template <typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -61,8 +59,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lwe_chunk_size);
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
#endif
template <typename Torus>
@@ -70,15 +67,13 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template <typename Torus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template <typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -87,16 +82,14 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lwe_chunk_size = 0);
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
template <typename Torus>
void scratch_cuda_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -105,8 +98,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lwe_chunk_size = 0);
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
template <typename Torus>
__host__ __device__ uint64_t

View File

@@ -176,8 +176,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size = 0) {
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
@@ -242,9 +241,8 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG,
@@ -336,12 +334,10 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lwe_chunk_size = 0) {
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(gpu_index, num_samples,
polynomial_size);
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, num_samples, polynomial_size);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {

View File

@@ -65,8 +65,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lwe_chunk_size) {
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -79,7 +78,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 512:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -87,7 +86,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 1024:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -95,7 +94,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 2048:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -103,7 +102,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 4096:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -111,7 +110,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 8192:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -119,7 +118,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 16384:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -127,7 +126,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -143,8 +142,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lwe_chunk_size) {
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -157,7 +155,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 512:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -165,7 +163,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 1024:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -173,7 +171,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 2048:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -181,7 +179,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 4096:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -189,7 +187,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 8192:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -197,7 +195,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 16384:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -205,7 +203,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -220,7 +218,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t lwe_chunk_size) {
uint32_t level_count, uint32_t num_samples) {
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
@@ -237,7 +235,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
#else
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
@@ -252,7 +250,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case PBS_VARIANT::DEFAULT:
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -264,7 +262,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
static_cast<uint64_t *>(lwe_input_indexes),
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
@@ -275,51 +273,50 @@ template <typename Torus>
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size) {
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, lwe_chunk_size);
allocate_gpu_memory);
break;
case 512:
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, lwe_chunk_size);
allocate_gpu_memory);
break;
case 1024:
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, lwe_chunk_size);
allocate_gpu_memory);
break;
case 2048:
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, lwe_chunk_size);
allocate_gpu_memory);
break;
case 4096:
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, lwe_chunk_size);
allocate_gpu_memory);
break;
case 8192:
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, lwe_chunk_size);
allocate_gpu_memory);
break;
case 16384:
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory, lwe_chunk_size);
allocate_gpu_memory);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -333,58 +330,50 @@ void scratch_cuda_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size) {
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
lwe_chunk_size);
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
break;
case 512:
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
lwe_chunk_size);
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
break;
case 1024:
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
lwe_chunk_size);
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
break;
case 2048:
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
lwe_chunk_size);
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
break;
case 4096:
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
lwe_chunk_size);
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
break;
case 8192:
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
lwe_chunk_size);
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
break;
case 16384:
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
lwe_chunk_size);
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -397,7 +386,7 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
bool allocate_gpu_memory) {
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
@@ -406,8 +395,7 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory,
lwe_chunk_size);
grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory);
else
#endif
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
@@ -416,13 +404,12 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, lwe_chunk_size);
input_lwe_ciphertext_count, allocate_gpu_memory);
else
scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory,
lwe_chunk_size);
grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory);
}
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
@@ -503,7 +490,7 @@ template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
bool allocate_gpu_memory);
template void
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -513,15 +500,13 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lwe_chunk_size);
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
template void scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template void
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -531,8 +516,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lwe_chunk_size);
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
template bool
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
@@ -545,58 +529,50 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size) {
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
lwe_chunk_size);
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
break;
case 512:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
lwe_chunk_size);
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
break;
case 1024:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
lwe_chunk_size);
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
break;
case 2048:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
lwe_chunk_size);
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
break;
case 4096:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
lwe_chunk_size);
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
break;
case 8192:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
lwe_chunk_size);
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
break;
case 16384:
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
lwe_chunk_size);
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -611,8 +587,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lwe_chunk_size) {
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -625,7 +600,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 512:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -633,7 +608,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 1024:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -641,7 +616,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 2048:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -649,7 +624,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 4096:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -657,7 +632,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 8192:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -665,7 +640,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
case 16384:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -673,7 +648,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, lwe_chunk_size);
num_samples);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -686,8 +661,7 @@ template void scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
uint32_t lwe_chunk_size);
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template void
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -697,6 +671,5 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lwe_chunk_size);
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
#endif

View File

@@ -376,7 +376,7 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
bool allocate_gpu_memory) {
int max_shared_memory = cuda_get_max_shared_memory(0);
uint64_t full_sm_keybundle =
@@ -469,9 +469,8 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT,
@@ -631,13 +630,10 @@ __host__ void host_multi_bit_programmable_bootstrap(
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lwe_chunk_size = 0) {
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
// If a chunk size is not passed to this function, select one.
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(gpu_index, num_samples,
polynomial_size);
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, num_samples, polynomial_size);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {

View File

@@ -172,7 +172,7 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
pbs_buffer<uint64_t, MULTI_BIT> **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
bool allocate_gpu_memory) {
bool supports_dsm =
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
@@ -252,9 +252,8 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size);
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::TBC,
@@ -364,13 +363,11 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t lwe_chunk_size = 0) {
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cudaSetDevice(gpu_index);
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(gpu_index, num_samples,
polynomial_size);
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
gpu_index, num_samples, polynomial_size);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {

View File

@@ -14,7 +14,6 @@ typedef struct {
int pbs_level;
int input_lwe_ciphertext_count;
int grouping_factor;
int chunk_size;
} MultiBitPBSBenchmarkParams;
typedef struct {
@@ -56,8 +55,6 @@ protected:
uint64_t *d_lwe_output_indexes;
int8_t *buffer;
int chunk_size;
public:
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(gpu_index);
@@ -69,7 +66,6 @@ public:
pbs_level = state.range(4);
input_lwe_ciphertext_count = state.range(5);
grouping_factor = state.range(6);
chunk_size = state.range(7);
DynamicDistribution lwe_modular_variance =
new_gaussian_from_std_dev(sqrt(0.000007069849454709433));
@@ -182,7 +178,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_level, grouping_factor,
input_lwe_ciphertext_count, true, chunk_size);
input_lwe_ciphertext_count, true);
for (auto _ : st) {
// Execute PBS
@@ -191,7 +187,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, input_lwe_ciphertext_count, chunk_size);
pbs_level, input_lwe_ciphertext_count);
cuda_synchronize_stream(stream);
}
@@ -211,7 +207,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
true, chunk_size);
true);
for (auto _ : st) {
// Execute PBS
@@ -220,7 +216,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
pbs_base_log, pbs_level, input_lwe_ciphertext_count, chunk_size);
pbs_base_log, pbs_level, input_lwe_ciphertext_count);
cuda_synchronize_stream(stream, gpu_index);
}
@@ -232,7 +228,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
lwe_dimension, glwe_dimension, polynomial_size, pbs_level,
grouping_factor, input_lwe_ciphertext_count, true, chunk_size);
grouping_factor, input_lwe_ciphertext_count, true);
for (auto _ : st) {
// Execute PBS
@@ -241,7 +237,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
pbs_base_log, pbs_level, input_lwe_ciphertext_count, chunk_size);
pbs_base_log, pbs_level, input_lwe_ciphertext_count);
cuda_synchronize_stream(stream, gpu_index);
}
@@ -362,9 +358,9 @@ MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// input_lwe_ciphertext_count
std::vector<MultiBitPBSBenchmarkParams> params = {
// 4_bits_multi_bit_group_2
(MultiBitPBSBenchmarkParams){818, 1, 2048, 22, 1, 1, 2, 0},
(MultiBitPBSBenchmarkParams){818, 1, 2048, 22, 1, 1, 2},
// 4_bits_multi_bit_group_3
(MultiBitPBSBenchmarkParams){888, 1, 2048, 21, 1, 1, 3, 0},
(MultiBitPBSBenchmarkParams){888, 1, 2048, 21, 1, 1, 3},
};
// Add to the list of parameters to benchmark
@@ -373,18 +369,7 @@ MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
input_lwe_ciphertext_count *= 2) {
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
x.grouping_factor, 0});
for (int lwe_chunk_size = 1;
lwe_chunk_size <= x.lwe_dimension / x.grouping_factor;
lwe_chunk_size *= 2)
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
x.grouping_factor, lwe_chunk_size});
int lwe_chunk_size = x.lwe_dimension / x.grouping_factor;
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
x.grouping_factor, lwe_chunk_size});
x.grouping_factor});
}
}
}
@@ -413,20 +398,20 @@ BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, TbcMultiBit)
->Apply(MultiBitPBSBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
"grouping_factor", "chunk_size"});
"grouping_factor"});
#endif
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, CgMultiBit)
->Apply(MultiBitPBSBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
"grouping_factor", "chunk_size"});
"grouping_factor"});
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, DefaultMultiBit)
->Apply(MultiBitPBSBenchmarkGenerateParams)
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
"grouping_factor", "chunk_size"});
"grouping_factor"});
#if CUDA_ARCH >= 900
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, TbcPBC)

View File

@@ -135,7 +135,7 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
(void *)d_lwe_input_indexes, (void *)d_bsk, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, number_of_inputs, 0);
pbs_level, number_of_inputs);
// Copy result to the host memory
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,

View File

@@ -223,7 +223,6 @@ extern "C" {
grouping_factor: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
lwe_chunk_size: u32,
);
/// Perform bootstrapping on a batch of input u64 LWE ciphertexts using the multi-bit algorithm.
@@ -270,7 +269,6 @@ extern "C" {
base_log: u32,
level: u32,
num_samples: u32,
lwe_chunk_size: u32,
);
/// This cleanup function frees the data for the multi-bit PBS on GPU

View File

@@ -182,7 +182,6 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
grouping_factor.0 as u32,
num_samples,
true,
0u32,
);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
@@ -202,7 +201,6 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
base_log.0 as u32,
level.0 as u32,
num_samples,
0,
);
cleanup_cuda_multi_bit_programmable_bootstrap(
streams.ptr[0],