mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-10 07:08:03 -05:00
refactor(gpu): remove lwe chunk size argument
This commit is contained in:
@@ -19,8 +19,7 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
uint32_t chunk_size = 0);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
@@ -28,7 +27,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lwe_chunk_size = 0);
|
||||
uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
|
||||
uint32_t gpu_index,
|
||||
@@ -51,8 +50,7 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
@@ -61,8 +59,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t lwe_chunk_size);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
#endif
|
||||
|
||||
template <typename Torus>
|
||||
@@ -70,15 +67,13 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size = 0);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus>
|
||||
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size = 0);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
@@ -87,16 +82,14 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t lwe_chunk_size = 0);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
template <typename Torus>
|
||||
void scratch_cuda_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size = 0);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
@@ -105,8 +98,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t lwe_chunk_size = 0);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ uint64_t
|
||||
|
||||
@@ -176,8 +176,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size = 0) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
uint64_t full_sm_keybundle =
|
||||
get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
|
||||
@@ -242,9 +241,8 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size);
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size);
|
||||
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG,
|
||||
@@ -336,12 +334,10 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t lwe_chunk_size = 0) {
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(gpu_index, num_samples,
|
||||
polynomial_size);
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, num_samples, polynomial_size);
|
||||
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
@@ -65,8 +65,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t lwe_chunk_size) {
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
|
||||
@@ -79,7 +78,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
@@ -87,7 +86,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
@@ -95,7 +94,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
@@ -103,7 +102,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
@@ -111,7 +110,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 8192:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
@@ -119,7 +118,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 16384:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
@@ -127,7 +126,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -143,8 +142,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t lwe_chunk_size) {
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
|
||||
@@ -157,7 +155,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
@@ -165,7 +163,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
@@ -173,7 +171,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
@@ -181,7 +179,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
@@ -189,7 +187,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 8192:
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
@@ -197,7 +195,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 16384:
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
@@ -205,7 +203,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -220,7 +218,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lwe_chunk_size) {
|
||||
uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
|
||||
@@ -237,7 +235,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
#else
|
||||
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
|
||||
@@ -252,7 +250,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case PBS_VARIANT::DEFAULT:
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
@@ -264,7 +262,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
|
||||
@@ -275,51 +273,50 @@ template <typename Torus>
|
||||
void scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory, lwe_chunk_size);
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -333,58 +330,50 @@ void scratch_cuda_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -397,7 +386,7 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
@@ -406,8 +395,7 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
else
|
||||
#endif
|
||||
if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
|
||||
@@ -416,13 +404,12 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
else
|
||||
scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
|
||||
@@ -503,7 +490,7 @@ template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size);
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
template void
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
@@ -513,15 +500,13 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t lwe_chunk_size);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
template void scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template void
|
||||
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
@@ -531,8 +516,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t lwe_chunk_size);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
|
||||
template bool
|
||||
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
@@ -545,58 +529,50 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size) {
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
case 16384:
|
||||
scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
|
||||
lwe_chunk_size);
|
||||
input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -611,8 +587,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t lwe_chunk_size) {
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
|
||||
@@ -625,7 +600,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
@@ -633,7 +608,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
@@ -641,7 +616,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
@@ -649,7 +624,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
@@ -657,7 +632,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 8192:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
@@ -665,7 +640,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
case 16384:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
@@ -673,7 +648,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, lwe_chunk_size);
|
||||
num_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -686,8 +661,7 @@ template void scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
uint32_t lwe_chunk_size);
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template void
|
||||
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
@@ -697,6 +671,5 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
uint64_t *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t lwe_chunk_size);
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
|
||||
#endif
|
||||
|
||||
@@ -376,7 +376,7 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
|
||||
pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
uint64_t full_sm_keybundle =
|
||||
@@ -469,9 +469,8 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size);
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size);
|
||||
*buffer = new pbs_buffer<Torus, MULTI_BIT>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT,
|
||||
@@ -631,13 +630,10 @@ __host__ void host_multi_bit_programmable_bootstrap(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t lwe_chunk_size = 0) {
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
|
||||
// If a chunk size is not passed to this function, select one.
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(gpu_index, num_samples,
|
||||
polynomial_size);
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, num_samples, polynomial_size);
|
||||
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
@@ -172,7 +172,7 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
|
||||
pbs_buffer<uint64_t, MULTI_BIT> **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
|
||||
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
bool supports_dsm =
|
||||
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
|
||||
@@ -252,9 +252,8 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size);
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size);
|
||||
*buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
|
||||
stream, gpu_index, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::TBC,
|
||||
@@ -364,13 +363,11 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
|
||||
Torus *lwe_array_in, Torus *lwe_input_indexes, uint64_t *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t lwe_chunk_size = 0) {
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
if (!lwe_chunk_size)
|
||||
lwe_chunk_size = get_lwe_chunk_size<Torus, params>(gpu_index, num_samples,
|
||||
polynomial_size);
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
gpu_index, num_samples, polynomial_size);
|
||||
|
||||
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
@@ -14,7 +14,6 @@ typedef struct {
|
||||
int pbs_level;
|
||||
int input_lwe_ciphertext_count;
|
||||
int grouping_factor;
|
||||
int chunk_size;
|
||||
} MultiBitPBSBenchmarkParams;
|
||||
|
||||
typedef struct {
|
||||
@@ -56,8 +55,6 @@ protected:
|
||||
uint64_t *d_lwe_output_indexes;
|
||||
int8_t *buffer;
|
||||
|
||||
int chunk_size;
|
||||
|
||||
public:
|
||||
void SetUp(const ::benchmark::State &state) {
|
||||
stream = cuda_create_stream(gpu_index);
|
||||
@@ -69,7 +66,6 @@ public:
|
||||
pbs_level = state.range(4);
|
||||
input_lwe_ciphertext_count = state.range(5);
|
||||
grouping_factor = state.range(6);
|
||||
chunk_size = state.range(7);
|
||||
|
||||
DynamicDistribution lwe_modular_variance =
|
||||
new_gaussian_from_std_dev(sqrt(0.000007069849454709433));
|
||||
@@ -182,7 +178,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
|
||||
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
stream, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, pbs_level, grouping_factor,
|
||||
input_lwe_ciphertext_count, true, chunk_size);
|
||||
input_lwe_ciphertext_count, true);
|
||||
|
||||
for (auto _ : st) {
|
||||
// Execute PBS
|
||||
@@ -191,7 +187,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
|
||||
d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
|
||||
pbs_level, input_lwe_ciphertext_count, chunk_size);
|
||||
pbs_level, input_lwe_ciphertext_count);
|
||||
cuda_synchronize_stream(stream);
|
||||
}
|
||||
|
||||
@@ -211,7 +207,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
|
||||
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
|
||||
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
|
||||
true, chunk_size);
|
||||
true);
|
||||
|
||||
for (auto _ : st) {
|
||||
// Execute PBS
|
||||
@@ -220,7 +216,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
|
||||
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
|
||||
d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
|
||||
pbs_base_log, pbs_level, input_lwe_ciphertext_count, chunk_size);
|
||||
pbs_base_log, pbs_level, input_lwe_ciphertext_count);
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
}
|
||||
|
||||
@@ -232,7 +228,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
|
||||
scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, pbs_level,
|
||||
grouping_factor, input_lwe_ciphertext_count, true, chunk_size);
|
||||
grouping_factor, input_lwe_ciphertext_count, true);
|
||||
|
||||
for (auto _ : st) {
|
||||
// Execute PBS
|
||||
@@ -241,7 +237,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
|
||||
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
|
||||
d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
|
||||
pbs_base_log, pbs_level, input_lwe_ciphertext_count, chunk_size);
|
||||
pbs_base_log, pbs_level, input_lwe_ciphertext_count);
|
||||
cuda_synchronize_stream(stream, gpu_index);
|
||||
}
|
||||
|
||||
@@ -362,9 +358,9 @@ MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
|
||||
// input_lwe_ciphertext_count
|
||||
std::vector<MultiBitPBSBenchmarkParams> params = {
|
||||
// 4_bits_multi_bit_group_2
|
||||
(MultiBitPBSBenchmarkParams){818, 1, 2048, 22, 1, 1, 2, 0},
|
||||
(MultiBitPBSBenchmarkParams){818, 1, 2048, 22, 1, 1, 2},
|
||||
// 4_bits_multi_bit_group_3
|
||||
(MultiBitPBSBenchmarkParams){888, 1, 2048, 21, 1, 1, 3, 0},
|
||||
(MultiBitPBSBenchmarkParams){888, 1, 2048, 21, 1, 1, 3},
|
||||
};
|
||||
|
||||
// Add to the list of parameters to benchmark
|
||||
@@ -373,18 +369,7 @@ MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
|
||||
input_lwe_ciphertext_count *= 2) {
|
||||
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
|
||||
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
|
||||
x.grouping_factor, 0});
|
||||
for (int lwe_chunk_size = 1;
|
||||
lwe_chunk_size <= x.lwe_dimension / x.grouping_factor;
|
||||
lwe_chunk_size *= 2)
|
||||
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
|
||||
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
|
||||
x.grouping_factor, lwe_chunk_size});
|
||||
|
||||
int lwe_chunk_size = x.lwe_dimension / x.grouping_factor;
|
||||
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
|
||||
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
|
||||
x.grouping_factor, lwe_chunk_size});
|
||||
x.grouping_factor});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -413,20 +398,20 @@ BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, TbcMultiBit)
|
||||
->Apply(MultiBitPBSBenchmarkGenerateParams)
|
||||
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
|
||||
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
|
||||
"grouping_factor", "chunk_size"});
|
||||
"grouping_factor"});
|
||||
#endif
|
||||
|
||||
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, CgMultiBit)
|
||||
->Apply(MultiBitPBSBenchmarkGenerateParams)
|
||||
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
|
||||
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
|
||||
"grouping_factor", "chunk_size"});
|
||||
"grouping_factor"});
|
||||
|
||||
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, DefaultMultiBit)
|
||||
->Apply(MultiBitPBSBenchmarkGenerateParams)
|
||||
->ArgNames({"lwe_dimension", "glwe_dimension", "polynomial_size",
|
||||
"pbs_base_log", "pbs_level", "input_lwe_ciphertext_count",
|
||||
"grouping_factor", "chunk_size"});
|
||||
"grouping_factor"});
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
BENCHMARK_REGISTER_F(ClassicalBootstrap_u64, TbcPBC)
|
||||
|
||||
@@ -135,7 +135,7 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
|
||||
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
|
||||
(void *)d_lwe_input_indexes, (void *)d_bsk, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
|
||||
pbs_level, number_of_inputs, 0);
|
||||
pbs_level, number_of_inputs);
|
||||
|
||||
// Copy result to the host memory
|
||||
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
|
||||
|
||||
@@ -223,7 +223,6 @@ extern "C" {
|
||||
grouping_factor: u32,
|
||||
input_lwe_ciphertext_count: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
lwe_chunk_size: u32,
|
||||
);
|
||||
|
||||
/// Perform bootstrapping on a batch of input u64 LWE ciphertexts using the multi-bit algorithm.
|
||||
@@ -270,7 +269,6 @@ extern "C" {
|
||||
base_log: u32,
|
||||
level: u32,
|
||||
num_samples: u32,
|
||||
lwe_chunk_size: u32,
|
||||
);
|
||||
|
||||
/// This cleanup function frees the data for the multi-bit PBS on GPU
|
||||
|
||||
@@ -182,7 +182,6 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
|
||||
grouping_factor.0 as u32,
|
||||
num_samples,
|
||||
true,
|
||||
0u32,
|
||||
);
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
streams.ptr[0],
|
||||
@@ -202,7 +201,6 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
|
||||
base_log.0 as u32,
|
||||
level.0 as u32,
|
||||
num_samples,
|
||||
0,
|
||||
);
|
||||
cleanup_cuda_multi_bit_programmable_bootstrap(
|
||||
streams.ptr[0],
|
||||
|
||||
Reference in New Issue
Block a user