mirror of
https://github.com/zama-ai/concrete.git
synced 2026-04-17 03:00:54 -04:00
refactor(cuda): introduce scratch for low latency pbs
This commit is contained in:
46
include/bit_extraction.h
Normal file
46
include/bit_extraction.h
Normal file
@@ -0,0 +1,46 @@
|
||||
#ifndef CUDA_BIT_EXTRACT_H
|
||||
#define CUDA_BIT_EXTRACT_H
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
extern "C" {
|
||||
|
||||
void scratch_cuda_extract_bits_32(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **bit_extract_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_extract_bits_64(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **bit_extract_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
|
||||
void *list_lwe_array_out, void *lwe_array_in,
|
||||
int8_t *bit_extract_buffer, void *ksk,
|
||||
void *fourier_bsk, uint32_t number_of_bits,
|
||||
uint32_t delta_log, uint32_t lwe_dimension_in,
|
||||
uint32_t lwe_dimension_out, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log_bsk,
|
||||
uint32_t level_count_bsk, uint32_t base_log_ksk,
|
||||
uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
|
||||
void *list_lwe_array_out, void *lwe_array_in,
|
||||
int8_t *bit_extract_buffer, void *ksk,
|
||||
void *fourier_bsk, uint32_t number_of_bits,
|
||||
uint32_t delta_log, uint32_t lwe_dimension_in,
|
||||
uint32_t lwe_dimension_out, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log_bsk,
|
||||
uint32_t level_count_bsk, uint32_t base_log_ksk,
|
||||
uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
void cleanup_cuda_extract_bits(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **bit_extract_buffer);
|
||||
}
|
||||
|
||||
#endif // CUDA_BIT_EXTRACT_H
|
||||
@@ -52,41 +52,36 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
|
||||
void cleanup_cuda_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
void scratch_cuda_bootstrap_low_latency_32(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_bootstrap_low_latency_64(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *test_vector,
|
||||
void *test_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_test_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
|
||||
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *test_vector,
|
||||
void *test_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_test_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
|
||||
|
||||
void cuda_extract_bits_32(
|
||||
void *v_stream, uint32_t gpu_index, void *list_lwe_array_out,
|
||||
void *lwe_array_in, void *lwe_array_in_buffer,
|
||||
void *lwe_array_in_shifted_buffer, void *lwe_array_out_ks_buffer,
|
||||
void *lwe_array_out_pbs_buffer, void *lut_pbs, void *lut_vector_indexes,
|
||||
void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk,
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
|
||||
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
void cuda_extract_bits_64(
|
||||
void *v_stream, uint32_t gpu_index, void *list_lwe_array_out,
|
||||
void *lwe_array_in, void *lwe_array_in_buffer,
|
||||
void *lwe_array_in_shifted_buffer, void *lwe_array_out_ks_buffer,
|
||||
void *lwe_array_out_pbs_buffer, void *lut_pbs, void *lut_vector_indexes,
|
||||
void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk,
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory);
|
||||
void cleanup_cuda_bootstrap_low_latency(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
void scratch_cuda_circuit_bootstrap_vertical_packing_32(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **cbs_vp_buffer,
|
||||
@@ -106,17 +101,19 @@ void scratch_cuda_wop_pbs_32(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
|
||||
uint32_t *delta_log, uint32_t *cbs_delta_log, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count_cbs,
|
||||
uint32_t level_count_bsk,
|
||||
uint32_t number_of_bits_of_message_including_padding,
|
||||
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory);
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory);
|
||||
|
||||
void scratch_cuda_wop_pbs_64(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
|
||||
uint32_t *delta_log, uint32_t *cbs_delta_log, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count_cbs,
|
||||
uint32_t level_count_bsk,
|
||||
uint32_t number_of_bits_of_message_including_padding,
|
||||
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory);
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory);
|
||||
|
||||
void cuda_circuit_bootstrap_vertical_packing_64(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
|
||||
|
||||
@@ -1,18 +1,112 @@
|
||||
#include "bit_extraction.cuh"
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the bit extraction on 32 bits inputs, into `cbs_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM mode is going to be used.
|
||||
*/
|
||||
void scratch_cuda_extract_bits_32(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **bit_extract_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
scratch_extract_bits<uint32_t, int32_t, Degree<512>>(
|
||||
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_extract_bits<uint32_t, int32_t, Degree<1024>>(
|
||||
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_extract_bits<uint32_t, int32_t, Degree<2048>>(
|
||||
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_extract_bits<uint32_t, int32_t, Degree<4096>>(
|
||||
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_extract_bits<uint32_t, int32_t, Degree<8192>>(
|
||||
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the bit extraction on 64 bits inputs, into `cbs_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM mode is going to be used.
|
||||
*/
|
||||
void scratch_cuda_extract_bits_64(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **bit_extract_buffer,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t number_of_inputs, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
scratch_extract_bits<uint64_t, int64_t, Degree<512>>(
|
||||
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_extract_bits<uint64_t, int64_t, Degree<1024>>(
|
||||
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_extract_bits<uint64_t, int64_t, Degree<2048>>(
|
||||
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_extract_bits<uint64_t, int64_t, Degree<4096>>(
|
||||
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_extract_bits<uint64_t, int64_t, Degree<8192>>(
|
||||
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Perform bit extract on a batch of 32 bit LWE ciphertexts.
|
||||
* See the corresponding function on 64 bit LWE ciphertexts for more details.
|
||||
*/
|
||||
void cuda_extract_bits_32(
|
||||
void *v_stream, uint32_t gpu_index, void *list_lwe_array_out,
|
||||
void *lwe_array_in, void *lwe_array_in_buffer,
|
||||
void *lwe_array_in_shifted_buffer, void *lwe_array_out_ks_buffer,
|
||||
void *lwe_array_out_pbs_buffer, void *lut_pbs, void *lut_vector_indexes,
|
||||
void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk,
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
|
||||
void *list_lwe_array_out, void *lwe_array_in,
|
||||
int8_t *bit_extract_buffer, void *ksk,
|
||||
void *fourier_bsk, uint32_t number_of_bits,
|
||||
uint32_t delta_log, uint32_t lwe_dimension_in,
|
||||
uint32_t lwe_dimension_out, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log_bsk,
|
||||
uint32_t level_count_bsk, uint32_t base_log_ksk,
|
||||
uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU extract bits): base log should be <= 32",
|
||||
base_log_bsk <= 32));
|
||||
assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
|
||||
@@ -20,9 +114,12 @@ void cuda_extract_bits_32(
|
||||
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
|
||||
lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
|
||||
lwe_dimension_in == 8192));
|
||||
// The number of samples should be lower than 4 time the number of streaming
|
||||
// multiprocessors divided by ((k + 1) * l) (the factor 4 being related
|
||||
// to the occupancy of 50%). The only supported value for k is 1, so
|
||||
assert(("Error (GPU extract bits): lwe_dimension_in should be equal to "
|
||||
"polynomial_size",
|
||||
lwe_dimension_in == polynomial_size));
|
||||
// The number of samples should be lower than four time the number of
|
||||
// streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
|
||||
// related to the occupancy of 50%). The only supported value for k is 1, so
|
||||
// k + 1 = 2 for now.
|
||||
int number_of_sm = 0;
|
||||
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
|
||||
@@ -36,62 +133,47 @@ void cuda_extract_bits_32(
|
||||
case 512:
|
||||
host_extract_bits<uint32_t, Degree<512>>(
|
||||
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer,
|
||||
(uint32_t *)lwe_array_in_shifted_buffer,
|
||||
(uint32_t *)lwe_array_out_ks_buffer,
|
||||
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
|
||||
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
|
||||
lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
|
||||
level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_extract_bits<uint32_t, Degree<1024>>(
|
||||
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer,
|
||||
(uint32_t *)lwe_array_in_shifted_buffer,
|
||||
(uint32_t *)lwe_array_out_ks_buffer,
|
||||
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
|
||||
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
|
||||
lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
|
||||
level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_extract_bits<uint32_t, Degree<2048>>(
|
||||
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer,
|
||||
(uint32_t *)lwe_array_in_shifted_buffer,
|
||||
(uint32_t *)lwe_array_out_ks_buffer,
|
||||
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
|
||||
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
|
||||
lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
|
||||
level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_extract_bits<uint32_t, Degree<4096>>(
|
||||
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer,
|
||||
(uint32_t *)lwe_array_in_shifted_buffer,
|
||||
(uint32_t *)lwe_array_out_ks_buffer,
|
||||
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
|
||||
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
|
||||
lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
|
||||
level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_extract_bits<uint32_t, Degree<8192>>(
|
||||
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer,
|
||||
(uint32_t *)lwe_array_in_shifted_buffer,
|
||||
(uint32_t *)lwe_array_out_ks_buffer,
|
||||
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
|
||||
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
|
||||
lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
|
||||
level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
|
||||
max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -143,16 +225,16 @@ void cuda_extract_bits_32(
|
||||
* This function will call corresponding template of wrapper host function which
|
||||
* will manage the calls of device functions.
|
||||
*/
|
||||
void cuda_extract_bits_64(
|
||||
void *v_stream, uint32_t gpu_index, void *list_lwe_array_out,
|
||||
void *lwe_array_in, void *lwe_array_in_buffer,
|
||||
void *lwe_array_in_shifted_buffer, void *lwe_array_out_ks_buffer,
|
||||
void *lwe_array_out_pbs_buffer, void *lut_pbs, void *lut_vector_indexes,
|
||||
void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk,
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
|
||||
void *list_lwe_array_out, void *lwe_array_in,
|
||||
int8_t *bit_extract_buffer, void *ksk,
|
||||
void *fourier_bsk, uint32_t number_of_bits,
|
||||
uint32_t delta_log, uint32_t lwe_dimension_in,
|
||||
uint32_t lwe_dimension_out, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log_bsk,
|
||||
uint32_t level_count_bsk, uint32_t base_log_ksk,
|
||||
uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU extract bits): base log should be <= 64",
|
||||
base_log_bsk <= 64));
|
||||
assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
|
||||
@@ -160,6 +242,9 @@ void cuda_extract_bits_64(
|
||||
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
|
||||
lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
|
||||
lwe_dimension_in == 8192));
|
||||
assert(("Error (GPU extract bits): lwe_dimension_in should be equal to "
|
||||
"polynomial_size",
|
||||
lwe_dimension_in == polynomial_size));
|
||||
// The number of samples should be lower than four time the number of
|
||||
// streaming multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being
|
||||
// related to the occupancy of 50%). The only supported value for k is 1, so
|
||||
@@ -176,64 +261,60 @@ void cuda_extract_bits_64(
|
||||
case 512:
|
||||
host_extract_bits<uint64_t, Degree<512>>(
|
||||
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer,
|
||||
(uint64_t *)lwe_array_in_shifted_buffer,
|
||||
(uint64_t *)lwe_array_out_ks_buffer,
|
||||
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
|
||||
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
|
||||
lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
|
||||
level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_extract_bits<uint64_t, Degree<1024>>(
|
||||
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer,
|
||||
(uint64_t *)lwe_array_in_shifted_buffer,
|
||||
(uint64_t *)lwe_array_out_ks_buffer,
|
||||
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
|
||||
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
|
||||
lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
|
||||
level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_extract_bits<uint64_t, Degree<2048>>(
|
||||
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer,
|
||||
(uint64_t *)lwe_array_in_shifted_buffer,
|
||||
(uint64_t *)lwe_array_out_ks_buffer,
|
||||
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
|
||||
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
|
||||
lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
|
||||
level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_extract_bits<uint64_t, Degree<4096>>(
|
||||
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer,
|
||||
(uint64_t *)lwe_array_in_shifted_buffer,
|
||||
(uint64_t *)lwe_array_out_ks_buffer,
|
||||
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
|
||||
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
|
||||
lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
|
||||
level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_extract_bits<uint64_t, Degree<8192>>(
|
||||
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer,
|
||||
(uint64_t *)lwe_array_in_shifted_buffer,
|
||||
(uint64_t *)lwe_array_out_ks_buffer,
|
||||
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_samples, max_shared_memory);
|
||||
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
|
||||
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
|
||||
lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
|
||||
level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
|
||||
max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This cleanup function frees the data for the bit extraction on GPU in
|
||||
* bit_extract_buffer for 32 or 64 bits inputs.
|
||||
*/
|
||||
void cleanup_cuda_extract_bits(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **bit_extract_buffer) {
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
// Free memory
|
||||
cuda_drop_async(*bit_extract_buffer, stream, gpu_index);
|
||||
}
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
#ifndef BIT_EXTRACT_H
|
||||
#define BIT_EXTRACT_H
|
||||
#ifndef BIT_EXTRACT_CUH
|
||||
#define BIT_EXTRACT_CUH
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include "bit_extraction.h"
|
||||
#include "bootstrap_low_latency.cuh"
|
||||
#include "device.h"
|
||||
#include "keyswitch.cuh"
|
||||
@@ -127,6 +127,56 @@ __global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ int
|
||||
get_buffer_size_extract_bits(uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t number_of_inputs) {
|
||||
|
||||
return sizeof(Torus) * number_of_inputs // lut_vector_indexes
|
||||
+ ((glwe_dimension + 1) * polynomial_size) * sizeof(Torus) // lut_pbs
|
||||
+ (glwe_dimension * polynomial_size + 1) *
|
||||
sizeof(Torus) // lwe_array_in_buffer
|
||||
+ (glwe_dimension * polynomial_size + 1) *
|
||||
sizeof(Torus) // lwe_array_in_shifted_buffer
|
||||
+ (lwe_dimension + 1) * sizeof(Torus) // lwe_array_out_ks_buffer
|
||||
+ (glwe_dimension * polynomial_size + 1) *
|
||||
sizeof(Torus); // lwe_array_out_pbs_buffer
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void
|
||||
scratch_extract_bits(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **bit_extract_buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
int buffer_size =
|
||||
get_buffer_size_extract_bits<Torus>(glwe_dimension, lwe_dimension,
|
||||
polynomial_size, number_of_inputs) +
|
||||
get_buffer_size_bootstrap_low_latency<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count, number_of_inputs,
|
||||
max_shared_memory);
|
||||
// allocate and initialize device pointers for bit extraction
|
||||
if (allocate_gpu_memory) {
|
||||
*bit_extract_buffer =
|
||||
(int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
|
||||
}
|
||||
// lut_vector_indexes is the last buffer in the bit_extract_buffer
|
||||
// it's hard set to 0: only one LUT is given as input, it's the same for all
|
||||
// LWE inputs For simplicity we initialize the whole buffer to 0
|
||||
check_cuda_error(
|
||||
cudaMemsetAsync(*bit_extract_buffer, 0, buffer_size, *stream));
|
||||
|
||||
scratch_bootstrap_low_latency<Torus, STorus, params>(
|
||||
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, number_of_inputs, max_shared_memory, false);
|
||||
}
|
||||
|
||||
/*
|
||||
* Host function for cuda extract bits.
|
||||
* it executes device functions in specific order and manages
|
||||
@@ -135,14 +185,12 @@ __global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value) {
|
||||
template <typename Torus, class params>
|
||||
__host__ void host_extract_bits(
|
||||
void *v_stream, uint32_t gpu_index, Torus *list_lwe_array_out,
|
||||
Torus *lwe_array_in, Torus *lwe_array_in_buffer,
|
||||
Torus *lwe_array_in_shifted_buffer, Torus *lwe_array_out_ks_buffer,
|
||||
Torus *lwe_array_out_pbs_buffer, Torus *lut_pbs, Torus *lut_vector_indexes,
|
||||
Torus *ksk, double2 *fourier_bsk, uint32_t number_of_bits,
|
||||
uint32_t delta_log, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk,
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
Torus *lwe_array_in, int8_t *bit_extract_buffer, Torus *ksk,
|
||||
double2 *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log_bsk,
|
||||
uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk,
|
||||
uint32_t number_of_samples, uint32_t max_shared_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
@@ -151,10 +199,34 @@ __host__ void host_extract_bits(
|
||||
int blocks = 1;
|
||||
int threads = params::degree / params::opt;
|
||||
|
||||
// Always define the PBS buffer first, because it has the strongest memory
|
||||
// alignment requirement (16 bytes for double2)
|
||||
int8_t *pbs_buffer = (int8_t *)bit_extract_buffer;
|
||||
Torus *lut_pbs =
|
||||
(Torus *)pbs_buffer +
|
||||
(ptrdiff_t)(get_buffer_size_bootstrap_low_latency<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count_bsk,
|
||||
number_of_samples, max_shared_memory) /
|
||||
sizeof(Torus));
|
||||
Torus *lwe_array_in_buffer =
|
||||
(Torus *)lut_pbs + (ptrdiff_t)((glwe_dimension + 1) * polynomial_size);
|
||||
Torus *lwe_array_in_shifted_buffer =
|
||||
(Torus *)lwe_array_in_buffer +
|
||||
(ptrdiff_t)(glwe_dimension * polynomial_size + 1);
|
||||
Torus *lwe_array_out_ks_buffer =
|
||||
(Torus *)lwe_array_in_shifted_buffer +
|
||||
(ptrdiff_t)(glwe_dimension * polynomial_size + 1);
|
||||
Torus *lwe_array_out_pbs_buffer =
|
||||
(Torus *)lwe_array_out_ks_buffer + (ptrdiff_t)(lwe_dimension_out + 1);
|
||||
// lut_vector_indexes is the last array in the bit_extract buffer
|
||||
Torus *lut_vector_indexes =
|
||||
(Torus *)lwe_array_out_pbs_buffer +
|
||||
(ptrdiff_t)((glwe_dimension * polynomial_size + 1) * sizeof(Torus));
|
||||
|
||||
// shift lwe on padding bit and copy in new buffer
|
||||
copy_and_shift_lwe<Torus, params><<<blocks, threads, 0, *stream>>>(
|
||||
lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_in,
|
||||
1ll << (ciphertext_n_bits - delta_log - 1));
|
||||
(Torus)(1ll << (ciphertext_n_bits - delta_log - 1)));
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) {
|
||||
@@ -173,9 +245,9 @@ __host__ void host_extract_bits(
|
||||
}
|
||||
|
||||
// Add q/4 to center the error while computing a negacyclic LUT
|
||||
add_to_body<Torus><<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer,
|
||||
lwe_dimension_out,
|
||||
1ll << (ciphertext_n_bits - 2));
|
||||
add_to_body<Torus>
|
||||
<<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer, lwe_dimension_out,
|
||||
(Torus)(1ll << (ciphertext_n_bits - 2)));
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
// Fill lut for the current bit (equivalent to trivial encryption as mask is
|
||||
@@ -183,12 +255,12 @@ __host__ void host_extract_bits(
|
||||
// delta*2^{bit_idx-1}
|
||||
fill_lut_body_for_current_bit<Torus, params>
|
||||
<<<blocks, threads, 0, *stream>>>(
|
||||
lut_pbs, 0ll - 1ll << (delta_log - 1 + bit_idx));
|
||||
lut_pbs, (Torus)(0ll - 1ll << (delta_log - 1 + bit_idx)));
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
host_bootstrap_low_latency<Torus, params>(
|
||||
v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_pbs,
|
||||
lut_vector_indexes, lwe_array_out_ks_buffer, fourier_bsk,
|
||||
lut_vector_indexes, lwe_array_out_ks_buffer, fourier_bsk, pbs_buffer,
|
||||
glwe_dimension, lwe_dimension_out, lwe_dimension_in, base_log_bsk,
|
||||
level_count_bsk, number_of_samples, 1, max_shared_memory);
|
||||
|
||||
@@ -196,10 +268,10 @@ __host__ void host_extract_bits(
|
||||
// of 0 if the extracted bit was 0 and 1 in the other case
|
||||
add_sub_and_mul_lwe<Torus, params><<<1, threads, 0, *stream>>>(
|
||||
lwe_array_in_shifted_buffer, lwe_array_in_buffer,
|
||||
lwe_array_out_pbs_buffer, 1ll << (delta_log - 1 + bit_idx),
|
||||
1ll << (ciphertext_n_bits - delta_log - bit_idx - 2));
|
||||
lwe_array_out_pbs_buffer, (Torus)(1ll << (delta_log - 1 + bit_idx)),
|
||||
(Torus)(1ll << (ciphertext_n_bits - delta_log - bit_idx - 2)));
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
#endif // BIT_EXTRACT_H
|
||||
#endif // BIT_EXTRACT_CUH
|
||||
|
||||
@@ -2,8 +2,9 @@
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the amortized PBS on 32 bits inputs, into `cmux_tree_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM mode is going to be used.
|
||||
* the amortized PBS on 32 bits inputs, into `pbs_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
|
||||
* be used.
|
||||
*/
|
||||
void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer,
|
||||
@@ -51,8 +52,9 @@ void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the amortized PBS on 64 bits inputs, into `cmux_tree_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM mode is going to be used.
|
||||
* the amortized PBS on 64 bits inputs, into `pbs_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
|
||||
* be used.
|
||||
*/
|
||||
void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer,
|
||||
|
||||
@@ -1,5 +1,113 @@
|
||||
#include "bootstrap_low_latency.cuh"
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the low latency PBS on 32 bits inputs, into `pbs_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
|
||||
* be used.
|
||||
*/
|
||||
void scratch_cuda_bootstrap_low_latency_32(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<256>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<512>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<1024>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<2048>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<4096>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<8192>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the low_latency PBS on 64 bits inputs, into `pbs_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
|
||||
* be used.
|
||||
*/
|
||||
void scratch_cuda_bootstrap_low_latency_64(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<256>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 512:
|
||||
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<512>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<1024>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<2048>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<4096>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<8192>>(
|
||||
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
|
||||
level_count, input_lwe_ciphertext_count, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Perform bootstrapping on a batch of input u32 LWE ciphertexts.
|
||||
* This function performs best for small numbers of inputs. Beyond a certain
|
||||
* number of inputs (the exact number depends on the cryptographic parameters),
|
||||
@@ -10,9 +118,10 @@
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
|
||||
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
assert(("Error (GPU low latency PBS): base log should be <= 32",
|
||||
base_log <= 32));
|
||||
@@ -38,7 +147,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
host_bootstrap_low_latency<uint32_t, Degree<256>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
@@ -46,7 +155,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
host_bootstrap_low_latency<uint32_t, Degree<512>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
@@ -54,7 +163,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
host_bootstrap_low_latency<uint32_t, Degree<1024>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
@@ -62,7 +171,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
host_bootstrap_low_latency<uint32_t, Degree<2048>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
@@ -70,7 +179,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
host_bootstrap_low_latency<uint32_t, Degree<4096>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
@@ -78,7 +187,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
host_bootstrap_low_latency<uint32_t, Degree<8192>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
@@ -166,9 +275,10 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
|
||||
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {
|
||||
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
assert(("Error (GPU low latency PBS): base log should be <= 64",
|
||||
base_log <= 64));
|
||||
@@ -194,7 +304,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
host_bootstrap_low_latency<uint64_t, Degree<256>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
@@ -202,7 +312,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
host_bootstrap_low_latency<uint64_t, Degree<512>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
@@ -210,7 +320,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
host_bootstrap_low_latency<uint64_t, Degree<1024>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
@@ -218,7 +328,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
host_bootstrap_low_latency<uint64_t, Degree<2048>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
@@ -226,7 +336,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
host_bootstrap_low_latency<uint64_t, Degree<4096>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
@@ -234,7 +344,7 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
host_bootstrap_low_latency<uint64_t, Degree<8192>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
|
||||
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
|
||||
max_shared_memory);
|
||||
break;
|
||||
@@ -242,3 +352,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This cleanup function frees the data for the low latency PBS on GPU in
|
||||
* pbs_buffer for 32 or 64 bits inputs.
|
||||
*/
|
||||
void cleanup_cuda_bootstrap_low_latency(void *v_stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer) {
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
// Free memory
|
||||
cuda_drop_async(*pbs_buffer, stream, gpu_index);
|
||||
}
|
||||
|
||||
@@ -131,10 +131,11 @@ template <typename Torus, class params, sharedMemDegree SMD>
|
||||
* Each y-block computes one element of the lwe_array_out.
|
||||
*/
|
||||
__global__ void device_bootstrap_low_latency(
|
||||
Torus *lwe_array_out, Torus *lut_vector, Torus *lwe_array_in,
|
||||
double2 *bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
int8_t *device_mem, int device_memory_size_per_block) {
|
||||
Torus *lwe_array_out, Torus *lut_vector, Torus *lut_vector_indexes,
|
||||
Torus *lwe_array_in, double2 *bootstrapping_key, double2 *join_buffer,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *device_mem,
|
||||
int device_memory_size_per_block) {
|
||||
|
||||
grid_group grid = this_grid();
|
||||
|
||||
@@ -152,26 +153,28 @@ __global__ void device_bootstrap_low_latency(
|
||||
else
|
||||
selected_memory = &device_mem[block_index * device_memory_size_per_block];
|
||||
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
// We always compute the pointer with most restrictive alignment to avoid
|
||||
// alignment issues
|
||||
double2 *accumulator_fft = (double2 *)selected_memory;
|
||||
Torus *accumulator =
|
||||
(Torus *)accumulator_fft +
|
||||
(ptrdiff_t)(sizeof(double2) * polynomial_size / 2 / sizeof(Torus));
|
||||
Torus *accumulator_rotated =
|
||||
(Torus *)accumulator + (ptrdiff_t)polynomial_size;
|
||||
double2 *accumulator_fft =
|
||||
(double2 *)accumulator_rotated +
|
||||
polynomial_size / (sizeof(double2) / sizeof(Torus));
|
||||
|
||||
if constexpr (SMD == PARTIALSM)
|
||||
accumulator_fft = (double2 *)sharedmem;
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
auto block_lwe_array_in = &lwe_array_in[blockIdx.z * (lwe_dimension + 1)];
|
||||
Torus *block_lwe_array_in = &lwe_array_in[blockIdx.z * (lwe_dimension + 1)];
|
||||
|
||||
auto block_lut_vector =
|
||||
&lut_vector[blockIdx.z * params::degree * (glwe_dimension + 1)];
|
||||
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
|
||||
params::degree * (glwe_dimension + 1)];
|
||||
|
||||
auto block_join_buffer =
|
||||
double2 *block_join_buffer =
|
||||
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
|
||||
params::degree / 2];
|
||||
|
||||
// Since the space is L1 cache is small, we use the same memory location for
|
||||
// the rotated accumulator and the fft accumulator, since we know that the
|
||||
// rotated array is not in use anymore by the time we perform the fft
|
||||
@@ -241,6 +244,82 @@ __global__ void device_bootstrap_low_latency(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ int
|
||||
get_buffer_size_full_sm_bootstrap_low_latency(uint32_t polynomial_size) {
|
||||
return sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(Torus) * polynomial_size + // accumulator
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ int
|
||||
get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
|
||||
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ int get_buffer_size_bootstrap_low_latency(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
|
||||
|
||||
int full_sm =
|
||||
get_buffer_size_full_sm_bootstrap_low_latency<Torus>(polynomial_size);
|
||||
int partial_sm =
|
||||
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
|
||||
int partial_dm = full_sm - partial_sm;
|
||||
int full_dm = full_sm;
|
||||
int device_mem = 0;
|
||||
if (max_shared_memory < partial_sm) {
|
||||
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
|
||||
(glwe_dimension + 1);
|
||||
}
|
||||
return device_mem + (glwe_dimension + 1) * level_count *
|
||||
input_lwe_ciphertext_count * polynomial_size / 2 *
|
||||
sizeof(double2);
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
__host__ void scratch_bootstrap_low_latency(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
|
||||
bool allocate_gpu_memory) {
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
int full_sm =
|
||||
get_buffer_size_full_sm_bootstrap_low_latency<Torus>(polynomial_size);
|
||||
int partial_sm =
|
||||
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency<Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_bootstrap_low_latency<Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory >= partial_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency<Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
|
||||
cudaFuncSetCacheConfig(device_bootstrap_low_latency<Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
if (allocate_gpu_memory) {
|
||||
int buffer_size = get_buffer_size_bootstrap_low_latency<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory);
|
||||
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Host wrapper to the low latency version
|
||||
* of bootstrapping
|
||||
@@ -249,94 +328,72 @@ template <typename Torus, class params>
|
||||
__host__ void host_bootstrap_low_latency(
|
||||
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
|
||||
Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count,
|
||||
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
int buffer_size_per_gpu = level_count * input_lwe_ciphertext_count *
|
||||
polynomial_size / 2 * sizeof(double2);
|
||||
double2 *buffer_fft = (double2 *)cuda_malloc_async(
|
||||
(glwe_dimension + 1) * buffer_size_per_gpu, stream, gpu_index);
|
||||
|
||||
// With SM each block corresponds to either the mask or body, no need to
|
||||
// duplicate data for each
|
||||
int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(Torus) * polynomial_size + // accumulator
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
int full_sm =
|
||||
get_buffer_size_full_sm_bootstrap_low_latency<Torus>(polynomial_size);
|
||||
|
||||
int SM_PART =
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
|
||||
int partial_sm =
|
||||
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
|
||||
|
||||
int DM_FULL = SM_FULL;
|
||||
int full_dm = full_sm;
|
||||
|
||||
int DM_PART = DM_FULL - SM_PART;
|
||||
int partial_dm = full_dm - partial_sm;
|
||||
|
||||
int8_t *d_mem;
|
||||
int8_t *d_mem = pbs_buffer;
|
||||
double2 *buffer_fft =
|
||||
(double2 *)d_mem +
|
||||
(ptrdiff_t)(get_buffer_size_bootstrap_low_latency<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory) /
|
||||
sizeof(double2) -
|
||||
(glwe_dimension + 1) * level_count *
|
||||
input_lwe_ciphertext_count * polynomial_size / 2);
|
||||
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
|
||||
|
||||
void *kernel_args[11];
|
||||
void *kernel_args[12];
|
||||
kernel_args[0] = &lwe_array_out;
|
||||
kernel_args[1] = &lut_vector;
|
||||
kernel_args[2] = &lwe_array_in;
|
||||
kernel_args[3] = &bootstrapping_key;
|
||||
kernel_args[4] = &buffer_fft;
|
||||
kernel_args[5] = &lwe_dimension;
|
||||
kernel_args[6] = &polynomial_size;
|
||||
kernel_args[7] = &base_log;
|
||||
kernel_args[8] = &level_count;
|
||||
kernel_args[9] = &d_mem;
|
||||
kernel_args[2] = &lut_vector_indexes;
|
||||
kernel_args[3] = &lwe_array_in;
|
||||
kernel_args[4] = &bootstrapping_key;
|
||||
kernel_args[5] = &buffer_fft;
|
||||
kernel_args[6] = &lwe_dimension;
|
||||
kernel_args[7] = &polynomial_size;
|
||||
kernel_args[8] = &base_log;
|
||||
kernel_args[9] = &level_count;
|
||||
kernel_args[10] = &d_mem;
|
||||
|
||||
if (max_shared_memory < SM_PART) {
|
||||
kernel_args[10] = &DM_FULL;
|
||||
check_cuda_error(cudaGetLastError());
|
||||
d_mem = (int8_t *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count *
|
||||
level_count * (glwe_dimension + 1),
|
||||
stream, gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
if (max_shared_memory < partial_sm) {
|
||||
kernel_args[11] = &full_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_bootstrap_low_latency<Torus, params, NOSM>, grid, thds,
|
||||
(void **)kernel_args, 0, *stream));
|
||||
} else if (max_shared_memory < SM_FULL) {
|
||||
kernel_args[10] = &DM_PART;
|
||||
d_mem = (int8_t *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count *
|
||||
level_count * (glwe_dimension + 1),
|
||||
stream, gpu_index);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency<Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_bootstrap_low_latency<Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
kernel_args[11] = &partial_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_bootstrap_low_latency<Torus, params, PARTIALSM>, grid,
|
||||
thds, (void **)kernel_args, SM_PART, *stream));
|
||||
thds, (void **)kernel_args, partial_sm, *stream));
|
||||
|
||||
} else {
|
||||
int DM_NONE = 0;
|
||||
kernel_args[10] = &DM_NONE;
|
||||
d_mem = (int8_t *)cuda_malloc_async(0, stream, gpu_index);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency<Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
|
||||
cudaFuncSetCacheConfig(device_bootstrap_low_latency<Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
int no_dm = 0;
|
||||
kernel_args[11] = &no_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_bootstrap_low_latency<Torus, params, FULLSM>, grid, thds,
|
||||
(void **)kernel_args, SM_FULL, *stream));
|
||||
(void **)kernel_args, full_sm, *stream));
|
||||
}
|
||||
|
||||
check_cuda_error(cudaGetLastError());
|
||||
// Synchronize the streams before copying the result to lwe_array_out at the
|
||||
// right place
|
||||
cuda_drop_async(buffer_fft, stream, gpu_index);
|
||||
cuda_drop_async(d_mem, stream, gpu_index);
|
||||
}
|
||||
|
||||
#endif // LOWLAT_PBS_H
|
||||
|
||||
@@ -50,7 +50,7 @@ void scratch_cuda_circuit_bootstrap_32(
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the circuit bootstrap on 32 bits inputs, into `cbs_buffer`. It also
|
||||
* the circuit bootstrap on 64 bits inputs, into `cbs_buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM mode is going to be used.
|
||||
*/
|
||||
void scratch_cuda_circuit_bootstrap_64(
|
||||
|
||||
@@ -2,7 +2,6 @@
|
||||
#define CBS_CUH
|
||||
|
||||
#include "bit_extraction.cuh"
|
||||
#include "bootstrap.h"
|
||||
#include "bootstrap_amortized.cuh"
|
||||
#include "device.h"
|
||||
#include "keyswitch.cuh"
|
||||
@@ -124,8 +123,7 @@ __host__ void scratch_circuit_bootstrap(
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
int pbs_count = number_of_inputs * level_count_cbs;
|
||||
// allocate and initialize device pointers for circuit bootstrap and vertical
|
||||
// packing
|
||||
// allocate and initialize device pointers for circuit bootstrap
|
||||
if (allocate_gpu_memory) {
|
||||
int buffer_size =
|
||||
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
|
||||
@@ -165,20 +163,24 @@ __host__ void host_circuit_bootstrap(
|
||||
dim3 blocks(level_cbs, number_of_inputs, 1);
|
||||
int threads = 256;
|
||||
|
||||
Torus *lwe_array_in_fp_ks_buffer = (Torus *)cbs_buffer;
|
||||
// Always define the PBS buffer first, because it has the strongest memory
|
||||
// alignment requirement (16 bytes for double2)
|
||||
int8_t *pbs_buffer = (int8_t *)cbs_buffer;
|
||||
Torus *lwe_array_out_pbs_buffer =
|
||||
(Torus *)lwe_array_in_fp_ks_buffer +
|
||||
(ptrdiff_t)(number_of_inputs * level_cbs * (glwe_dimension + 1) *
|
||||
(polynomial_size + 1));
|
||||
(Torus *)pbs_buffer +
|
||||
(ptrdiff_t)(
|
||||
get_buffer_size_bootstrap_amortized<Torus>(
|
||||
glwe_dimension, polynomial_size, pbs_count, max_shared_memory) /
|
||||
sizeof(Torus));
|
||||
Torus *lwe_array_in_shifted_buffer =
|
||||
(Torus *)lwe_array_out_pbs_buffer +
|
||||
lwe_array_out_pbs_buffer +
|
||||
(ptrdiff_t)(number_of_inputs * level_cbs * (polynomial_size + 1));
|
||||
Torus *lut_vector =
|
||||
(Torus *)lwe_array_in_shifted_buffer +
|
||||
lwe_array_in_shifted_buffer +
|
||||
(ptrdiff_t)(number_of_inputs * level_cbs * (lwe_dimension + 1));
|
||||
int8_t *pbs_buffer =
|
||||
(int8_t *)lut_vector + (ptrdiff_t)(level_cbs * (glwe_dimension + 1) *
|
||||
polynomial_size * sizeof(Torus));
|
||||
Torus *lwe_array_in_fp_ks_buffer =
|
||||
lut_vector +
|
||||
(ptrdiff_t)(level_cbs * (glwe_dimension + 1) * polynomial_size);
|
||||
|
||||
// Shift message LSB on padding bit, at this point we expect to have messages
|
||||
// with only 1 bit of information
|
||||
|
||||
@@ -108,44 +108,50 @@ void scratch_cuda_wop_pbs_32(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
|
||||
uint32_t *delta_log, uint32_t *cbs_delta_log, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count_cbs,
|
||||
uint32_t level_count_bsk,
|
||||
uint32_t number_of_bits_of_message_including_padding,
|
||||
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory) {
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
scratch_wop_pbs<uint32_t, int32_t, Degree<512>>(
|
||||
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
|
||||
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
|
||||
number_of_inputs, max_shared_memory);
|
||||
level_count_bsk, number_of_bits_of_message_including_padding,
|
||||
number_of_bits_to_extract, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_wop_pbs<uint32_t, int32_t, Degree<1024>>(
|
||||
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
|
||||
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
|
||||
number_of_inputs, max_shared_memory);
|
||||
level_count_bsk, number_of_bits_of_message_including_padding,
|
||||
number_of_bits_to_extract, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_wop_pbs<uint32_t, int32_t, Degree<2048>>(
|
||||
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
|
||||
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
|
||||
number_of_inputs, max_shared_memory);
|
||||
level_count_bsk, number_of_bits_of_message_including_padding,
|
||||
number_of_bits_to_extract, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_wop_pbs<uint32_t, int32_t, Degree<4096>>(
|
||||
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
|
||||
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
|
||||
number_of_inputs, max_shared_memory);
|
||||
level_count_bsk, number_of_bits_of_message_including_padding,
|
||||
number_of_bits_to_extract, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_wop_pbs<uint32_t, int32_t, Degree<8192>>(
|
||||
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
|
||||
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
|
||||
number_of_inputs, max_shared_memory);
|
||||
level_count_bsk, number_of_bits_of_message_including_padding,
|
||||
number_of_bits_to_extract, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -162,44 +168,50 @@ void scratch_cuda_wop_pbs_64(
|
||||
void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
|
||||
uint32_t *delta_log, uint32_t *cbs_delta_log, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count_cbs,
|
||||
uint32_t level_count_bsk,
|
||||
uint32_t number_of_bits_of_message_including_padding,
|
||||
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory) {
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
scratch_wop_pbs<uint64_t, int64_t, Degree<512>>(
|
||||
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
|
||||
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
|
||||
number_of_inputs, max_shared_memory);
|
||||
level_count_bsk, number_of_bits_of_message_including_padding,
|
||||
number_of_bits_to_extract, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 1024:
|
||||
scratch_wop_pbs<uint64_t, int64_t, Degree<1024>>(
|
||||
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
|
||||
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
|
||||
number_of_inputs, max_shared_memory);
|
||||
level_count_bsk, number_of_bits_of_message_including_padding,
|
||||
number_of_bits_to_extract, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 2048:
|
||||
scratch_wop_pbs<uint64_t, int64_t, Degree<2048>>(
|
||||
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
|
||||
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
|
||||
number_of_inputs, max_shared_memory);
|
||||
level_count_bsk, number_of_bits_of_message_including_padding,
|
||||
number_of_bits_to_extract, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 4096:
|
||||
scratch_wop_pbs<uint64_t, int64_t, Degree<4096>>(
|
||||
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
|
||||
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
|
||||
number_of_inputs, max_shared_memory);
|
||||
level_count_bsk, number_of_bits_of_message_including_padding,
|
||||
number_of_bits_to_extract, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
case 8192:
|
||||
scratch_wop_pbs<uint64_t, int64_t, Degree<8192>>(
|
||||
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
|
||||
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
|
||||
number_of_inputs, max_shared_memory);
|
||||
level_count_bsk, number_of_bits_of_message_including_padding,
|
||||
number_of_bits_to_extract, number_of_inputs, max_shared_memory,
|
||||
allocate_gpu_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
||||
@@ -191,20 +191,11 @@ __host__ void host_circuit_bootstrap_vertical_packing(
|
||||
|
||||
template <typename Torus>
|
||||
__host__ __device__ int
|
||||
get_buffer_size_wop_pbs(uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count_cbs,
|
||||
uint32_t number_of_bits_of_message_including_padding,
|
||||
uint32_t number_of_bits_to_extract,
|
||||
uint32_t number_of_inputs) {
|
||||
get_buffer_size_wop_pbs(uint32_t lwe_dimension,
|
||||
uint32_t number_of_bits_of_message_including_padding) {
|
||||
|
||||
return sizeof(Torus) // lut_vector_indexes
|
||||
+ ((glwe_dimension + 1) * polynomial_size) * sizeof(Torus) // lut_pbs
|
||||
+ (polynomial_size + 1) * sizeof(Torus) // lwe_array_in_buffer
|
||||
+ (polynomial_size + 1) * sizeof(Torus) // lwe_array_in_shifted_buffer
|
||||
+ (lwe_dimension + 1) * sizeof(Torus) // lwe_array_out_ks_buffer
|
||||
+ (polynomial_size + 1) * sizeof(Torus) // lwe_array_out_pbs_buffer
|
||||
+ (lwe_dimension + 1) * // lwe_array_out_bit_extract
|
||||
(number_of_bits_of_message_including_padding) * sizeof(Torus);
|
||||
return (lwe_dimension + 1) * (number_of_bits_of_message_including_padding) *
|
||||
sizeof(Torus); // lwe_array_out_bit_extract
|
||||
}
|
||||
|
||||
template <typename Torus, typename STorus, typename params>
|
||||
@@ -213,52 +204,64 @@ scratch_wop_pbs(void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
|
||||
uint32_t *delta_log, uint32_t *cbs_delta_log,
|
||||
uint32_t glwe_dimension, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count_cbs,
|
||||
uint32_t level_count_bsk,
|
||||
uint32_t number_of_bits_of_message_including_padding,
|
||||
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
|
||||
uint32_t max_shared_memory) {
|
||||
uint32_t max_shared_memory, bool allocate_gpu_memory) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
int wop_pbs_buffer_size = get_buffer_size_wop_pbs<Torus>(
|
||||
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
|
||||
number_of_bits_of_message_including_padding, number_of_bits_to_extract,
|
||||
number_of_inputs);
|
||||
int bit_extract_buffer_size =
|
||||
get_buffer_size_extract_bits<Torus>(glwe_dimension, lwe_dimension,
|
||||
polynomial_size, number_of_inputs) +
|
||||
get_buffer_size_bootstrap_low_latency<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count_bsk, number_of_inputs,
|
||||
max_shared_memory);
|
||||
uint32_t cbs_vp_number_of_inputs =
|
||||
number_of_inputs * number_of_bits_to_extract;
|
||||
uint32_t tau = number_of_inputs;
|
||||
uint32_t r = cbs_vp_number_of_inputs - params::log2_degree;
|
||||
uint32_t mbr_size = cbs_vp_number_of_inputs - r;
|
||||
int buffer_size =
|
||||
get_buffer_size_cbs_vp<Torus>(glwe_dimension, polynomial_size,
|
||||
level_count_cbs, tau,
|
||||
cbs_vp_number_of_inputs) +
|
||||
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension, polynomial_size,
|
||||
level_count_cbs, cbs_vp_number_of_inputs) +
|
||||
get_buffer_size_bootstrap_amortized<Torus>(
|
||||
glwe_dimension, polynomial_size,
|
||||
cbs_vp_number_of_inputs * level_count_cbs, max_shared_memory) +
|
||||
get_buffer_size_cmux_tree<Torus>(glwe_dimension, polynomial_size,
|
||||
level_count_cbs, r, tau,
|
||||
max_shared_memory) +
|
||||
get_buffer_size_blind_rotation_sample_extraction<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count_cbs, mbr_size, tau,
|
||||
max_shared_memory) +
|
||||
wop_pbs_buffer_size;
|
||||
if (allocate_gpu_memory) {
|
||||
int buffer_size =
|
||||
bit_extract_buffer_size +
|
||||
get_buffer_size_wop_pbs<Torus>(
|
||||
lwe_dimension, number_of_bits_of_message_including_padding) +
|
||||
get_buffer_size_cbs_vp<Torus>(glwe_dimension, polynomial_size,
|
||||
level_count_cbs, tau,
|
||||
cbs_vp_number_of_inputs) +
|
||||
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_cbs,
|
||||
cbs_vp_number_of_inputs) +
|
||||
get_buffer_size_bootstrap_amortized<Torus>(
|
||||
glwe_dimension, polynomial_size,
|
||||
cbs_vp_number_of_inputs * level_count_cbs, max_shared_memory) +
|
||||
get_buffer_size_cmux_tree<Torus>(glwe_dimension, polynomial_size,
|
||||
level_count_cbs, r, tau,
|
||||
max_shared_memory) +
|
||||
get_buffer_size_blind_rotation_sample_extraction<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count_cbs, mbr_size, tau,
|
||||
max_shared_memory);
|
||||
|
||||
*wop_pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
|
||||
|
||||
// indexes of lut vectors for bit extract
|
||||
Torus h_lut_vector_indexes = 0;
|
||||
// lut_vector_indexes is the first array in the wop_pbs buffer
|
||||
cuda_memcpy_async_to_gpu(*wop_pbs_buffer, (int8_t *)&h_lut_vector_indexes,
|
||||
sizeof(Torus), stream, gpu_index);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
*wop_pbs_buffer =
|
||||
(int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
|
||||
}
|
||||
uint32_t ciphertext_total_bits_count = sizeof(Torus) * 8;
|
||||
*delta_log =
|
||||
ciphertext_total_bits_count - number_of_bits_of_message_including_padding;
|
||||
|
||||
int8_t *bit_extract_buffer =
|
||||
(int8_t *)*wop_pbs_buffer +
|
||||
(ptrdiff_t)(get_buffer_size_wop_pbs<Torus>(
|
||||
lwe_dimension, number_of_bits_of_message_including_padding));
|
||||
scratch_extract_bits<Torus, STorus, params>(
|
||||
v_stream, gpu_index, &bit_extract_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, level_count_bsk, number_of_inputs, max_shared_memory,
|
||||
false);
|
||||
|
||||
int8_t *cbs_vp_buffer =
|
||||
(int8_t *)*wop_pbs_buffer + (ptrdiff_t)wop_pbs_buffer_size;
|
||||
bit_extract_buffer + (ptrdiff_t)bit_extract_buffer_size;
|
||||
scratch_circuit_bootstrap_vertical_packing<Torus, STorus, params>(
|
||||
v_stream, gpu_index, &cbs_vp_buffer, cbs_delta_log, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, level_count_cbs,
|
||||
@@ -290,41 +293,34 @@ __host__ void host_wop_pbs(
|
||||
uint32_t number_of_bits_to_extract, uint32_t delta_log,
|
||||
uint32_t number_of_inputs, uint32_t max_shared_memory) {
|
||||
|
||||
// lut_vector_indexes is the first array in the wop_pbs buffer
|
||||
Torus *lut_vector_indexes = (Torus *)wop_pbs_buffer;
|
||||
Torus *lut_pbs = (Torus *)lut_vector_indexes + (ptrdiff_t)(1);
|
||||
Torus *lwe_array_in_buffer =
|
||||
(Torus *)lut_pbs + (ptrdiff_t)((glwe_dimension + 1) * polynomial_size);
|
||||
Torus *lwe_array_in_shifted_buffer =
|
||||
(Torus *)lwe_array_in_buffer + (ptrdiff_t)(polynomial_size + 1);
|
||||
Torus *lwe_array_out_ks_buffer =
|
||||
(Torus *)lwe_array_in_shifted_buffer + (ptrdiff_t)(polynomial_size + 1);
|
||||
Torus *lwe_array_out_pbs_buffer =
|
||||
(Torus *)lwe_array_out_ks_buffer + (ptrdiff_t)(lwe_dimension + 1);
|
||||
Torus *lwe_array_out_bit_extract =
|
||||
(Torus *)lwe_array_out_pbs_buffer + (ptrdiff_t)(polynomial_size + 1);
|
||||
int8_t *bit_extract_buffer = wop_pbs_buffer;
|
||||
int8_t *lwe_array_out_bit_extract =
|
||||
bit_extract_buffer +
|
||||
(ptrdiff_t)(get_buffer_size_extract_bits<Torus>(
|
||||
glwe_dimension, lwe_dimension, polynomial_size,
|
||||
number_of_inputs) +
|
||||
get_buffer_size_bootstrap_low_latency<Torus>(
|
||||
glwe_dimension, polynomial_size, level_count_bsk,
|
||||
number_of_inputs, max_shared_memory));
|
||||
host_extract_bits<Torus, params>(
|
||||
v_stream, gpu_index, lwe_array_out_bit_extract, lwe_array_in,
|
||||
lwe_array_in_buffer, lwe_array_in_shifted_buffer, lwe_array_out_ks_buffer,
|
||||
lwe_array_out_pbs_buffer, lut_pbs, lut_vector_indexes, ksk, fourier_bsk,
|
||||
number_of_bits_to_extract, delta_log, polynomial_size, lwe_dimension,
|
||||
glwe_dimension, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
v_stream, gpu_index, (Torus *)lwe_array_out_bit_extract, lwe_array_in,
|
||||
bit_extract_buffer, ksk, fourier_bsk, number_of_bits_to_extract,
|
||||
delta_log, polynomial_size, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
|
||||
level_count_ksk, number_of_inputs, max_shared_memory);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
int8_t *cbs_vp_buffer =
|
||||
(int8_t *)wop_pbs_buffer +
|
||||
(ptrdiff_t)get_buffer_size_wop_pbs<Torus>(
|
||||
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
|
||||
number_of_bits_of_message_including_padding,
|
||||
number_of_bits_to_extract, number_of_inputs);
|
||||
lwe_array_out_bit_extract +
|
||||
(ptrdiff_t)(get_buffer_size_wop_pbs<Torus>(
|
||||
lwe_dimension, number_of_bits_of_message_including_padding));
|
||||
host_circuit_bootstrap_vertical_packing<Torus, STorus, params>(
|
||||
v_stream, gpu_index, lwe_array_out, lwe_array_out_bit_extract, lut_vector,
|
||||
fourier_bsk, cbs_fpksk, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
|
||||
base_log_pksk, level_count_pksk, base_log_cbs, level_count_cbs,
|
||||
number_of_inputs * number_of_bits_to_extract, number_of_inputs,
|
||||
max_shared_memory);
|
||||
v_stream, gpu_index, lwe_array_out, (Torus *)lwe_array_out_bit_extract,
|
||||
lut_vector, fourier_bsk, cbs_fpksk, cbs_vp_buffer, cbs_delta_log,
|
||||
glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk,
|
||||
level_count_bsk, base_log_pksk, level_count_pksk, base_log_cbs,
|
||||
level_count_cbs, number_of_inputs * number_of_bits_to_extract,
|
||||
number_of_inputs, max_shared_memory);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
#endif // WOP_PBS_H
|
||||
|
||||
Reference in New Issue
Block a user