mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-11 07:38:08 -05:00
Compare commits
2 Commits
al/fix_gpu
...
al/remove_
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
47e847dab4 | ||
|
|
c6ff00c500 |
@@ -18,7 +18,7 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
|
||||
|
||||
void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
|
||||
void *lwe_array_out, void *glwe_array_in,
|
||||
uint32_t *nth_array, uint32_t num_glwes,
|
||||
uint32_t *nth_array, uint32_t num_nths,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size);
|
||||
};
|
||||
|
||||
@@ -8,7 +8,7 @@ void scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
@@ -17,7 +17,7 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
|
||||
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
|
||||
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
|
||||
bool allocate_gpu_memory);
|
||||
|
||||
@@ -96,7 +96,7 @@ template <typename Torus> struct int_decompression {
|
||||
|
||||
uint32_t storage_log_modulus;
|
||||
|
||||
uint32_t num_lwes;
|
||||
uint32_t num_radix_blocks;
|
||||
uint32_t body_count;
|
||||
|
||||
Torus *tmp_extracted_glwe;
|
||||
@@ -113,7 +113,7 @@ template <typename Torus> struct int_decompression {
|
||||
this->encryption_params = encryption_params;
|
||||
this->compression_params = compression_params;
|
||||
this->storage_log_modulus = storage_log_modulus;
|
||||
this->num_lwes = num_radix_blocks;
|
||||
this->num_radix_blocks = num_radix_blocks;
|
||||
this->body_count = body_count;
|
||||
|
||||
if (allocate_gpu_memory) {
|
||||
@@ -134,7 +134,7 @@ template <typename Torus> struct int_decompression {
|
||||
tmp_extracted_lwe = (Torus *)cuda_malloc_async(
|
||||
num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0],
|
||||
gpu_indexes[0]);
|
||||
// Decompression
|
||||
|
||||
// Carry extract LUT
|
||||
auto carry_extract_f = [encryption_params](Torus x) -> Torus {
|
||||
return x / encryption_params.message_modulus;
|
||||
@@ -157,7 +157,7 @@ template <typename Torus> struct int_decompression {
|
||||
cuda_drop_async(tmp_indexes_array, streams[0], gpu_indexes[0]);
|
||||
|
||||
carry_extract_lut->release(streams, gpu_indexes, gpu_count);
|
||||
delete (carry_extract_lut);
|
||||
delete carry_extract_lut;
|
||||
}
|
||||
};
|
||||
#endif
|
||||
|
||||
@@ -69,7 +69,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);
|
||||
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride,
|
||||
bool do_modulus_switch);
|
||||
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
@@ -77,7 +78,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride);
|
||||
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride,
|
||||
bool do_modulus_switch);
|
||||
|
||||
void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
|
||||
int8_t **pbs_buffer);
|
||||
@@ -332,7 +334,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride);
|
||||
uint32_t lut_stride, bool do_modulus_switch);
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
@@ -342,7 +344,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride);
|
||||
uint32_t lut_stride, bool do_modulus_switch);
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
template <typename Torus>
|
||||
@@ -353,7 +355,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride);
|
||||
uint32_t lut_stride, bool do_modulus_switch);
|
||||
|
||||
template <typename Torus>
|
||||
void scratch_cuda_programmable_bootstrap_tbc(
|
||||
|
||||
@@ -23,7 +23,7 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *stream,
|
||||
|
||||
void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
|
||||
void *lwe_array_out, void *glwe_array_in,
|
||||
uint32_t *nth_array, uint32_t num_glwes,
|
||||
uint32_t *nth_array, uint32_t num_nths,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size) {
|
||||
|
||||
@@ -31,43 +31,43 @@ void cuda_glwe_sample_extract_64(void *stream, uint32_t gpu_index,
|
||||
case 256:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 512:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 1024:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 2048:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 4096:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 8192:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
|
||||
glwe_dimension);
|
||||
break;
|
||||
case 16384:
|
||||
host_sample_extract<uint64_t, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (uint64_t *)lwe_array_out,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_glwes,
|
||||
(uint64_t *)glwe_array_in, (uint32_t *)nth_array, num_nths,
|
||||
glwe_dimension);
|
||||
break;
|
||||
default:
|
||||
|
||||
@@ -4,7 +4,7 @@ void scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
|
||||
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
@@ -16,15 +16,16 @@ void scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
|
||||
scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_compression<uint64_t> **)mem_ptr, num_lwes, compression_params,
|
||||
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
|
||||
(int_compression<uint64_t> **)mem_ptr, num_radix_blocks,
|
||||
compression_params, lwe_per_glwe, storage_log_modulus,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
void scratch_cuda_integer_decompress_radix_ciphertext_64(
|
||||
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
|
||||
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
|
||||
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
|
||||
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
|
||||
uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
|
||||
PBS_TYPE pbs_type, uint32_t storage_log_modulus, uint32_t body_count,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
@@ -41,7 +42,7 @@ void scratch_cuda_integer_decompress_radix_ciphertext_64(
|
||||
|
||||
scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
|
||||
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
|
||||
(int_decompression<uint64_t> **)mem_ptr, num_lwes, body_count,
|
||||
(int_decompression<uint64_t> **)mem_ptr, num_radix_blocks, body_count,
|
||||
encryption_params, compression_params, storage_log_modulus,
|
||||
allocate_gpu_memory);
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
|
||||
template <typename Torus>
|
||||
__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
|
||||
uint32_t num_glwes, uint32_t in_len, uint32_t out_len) {
|
||||
uint32_t num_coeffs, uint32_t in_len, uint32_t out_len) {
|
||||
auto nbits = sizeof(Torus) * 8;
|
||||
auto tid = threadIdx.x + blockIdx.x * blockDim.x;
|
||||
|
||||
@@ -21,7 +21,7 @@ __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
|
||||
auto chunk_array_in = array_in + glwe_index * in_len;
|
||||
auto chunk_array_out = array_out + glwe_index * out_len;
|
||||
|
||||
if (tid < num_glwes * out_len) {
|
||||
if (tid < num_coeffs) {
|
||||
|
||||
auto k = nbits * i / log_modulus;
|
||||
auto j = k;
|
||||
@@ -44,11 +44,15 @@ __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
|
||||
template <typename Torus>
|
||||
__host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *array_out, Torus *array_in, uint32_t num_glwes,
|
||||
int_compression<Torus> *mem_ptr) {
|
||||
uint32_t num_lwes, int_compression<Torus> *mem_ptr) {
|
||||
if (array_in == array_out)
|
||||
PANIC("Cuda error: Input and output must be different");
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
auto compression_params = mem_ptr->compression_params;
|
||||
|
||||
auto log_modulus = mem_ptr->storage_log_modulus;
|
||||
// [0..num_glwes-1) GLWEs
|
||||
auto in_len = (compression_params.glwe_dimension + 1) *
|
||||
compression_params.polynomial_size;
|
||||
auto number_bits_to_pack = in_len * log_modulus;
|
||||
@@ -56,20 +60,35 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
|
||||
// number_bits_to_pack.div_ceil(Scalar::BITS)
|
||||
auto out_len = (number_bits_to_pack + nbits - 1) / nbits;
|
||||
|
||||
// Last GLWE
|
||||
auto last_body_count = num_lwes % compression_params.polynomial_size;
|
||||
in_len =
|
||||
compression_params.glwe_dimension * compression_params.polynomial_size +
|
||||
last_body_count;
|
||||
number_bits_to_pack = in_len * log_modulus;
|
||||
auto last_out_len = (number_bits_to_pack + nbits - 1) / nbits;
|
||||
|
||||
auto num_coeffs = (num_glwes - 1) * out_len + last_out_len;
|
||||
|
||||
int num_blocks = 0, num_threads = 0;
|
||||
getNumBlocksAndThreads(num_glwes * out_len, 1024, num_blocks, num_threads);
|
||||
getNumBlocksAndThreads(num_coeffs, 1024, num_blocks, num_threads);
|
||||
|
||||
dim3 grid(num_blocks);
|
||||
dim3 threads(num_threads);
|
||||
cuda_memset_async(array_out, 0,
|
||||
num_glwes * (compression_params.glwe_dimension + 1) *
|
||||
compression_params.polynomial_size * sizeof(Torus),
|
||||
stream, gpu_index);
|
||||
pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
|
||||
num_glwes, in_len, out_len);
|
||||
num_coeffs, in_len, out_len);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_integer_compress(cudaStream_t *streams,
|
||||
uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
Torus *glwe_array_out, Torus *lwe_array_in,
|
||||
Torus **fp_ksk, uint32_t num_lwes,
|
||||
Torus **fp_ksk, uint32_t num_radix_blocks,
|
||||
int_compression<Torus> *mem_ptr) {
|
||||
|
||||
auto compression_params = mem_ptr->compression_params;
|
||||
@@ -80,21 +99,23 @@ __host__ void host_integer_compress(cudaStream_t *streams,
|
||||
host_cleartext_multiplication<Torus>(
|
||||
streams[0], gpu_indexes[0], lwe_shifted, lwe_array_in,
|
||||
(uint64_t)compression_params.message_modulus, input_lwe_dimension,
|
||||
num_lwes);
|
||||
num_radix_blocks);
|
||||
|
||||
uint32_t lwe_in_size = input_lwe_dimension + 1;
|
||||
uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
|
||||
compression_params.polynomial_size;
|
||||
uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1;
|
||||
uint32_t num_glwes_for_compression =
|
||||
num_radix_blocks / mem_ptr->lwe_per_glwe + 1;
|
||||
|
||||
// Keyswitch LWEs to GLWE
|
||||
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
|
||||
cuda_memset_async(tmp_glwe_array_out, 0,
|
||||
num_glwes * (compression_params.glwe_dimension + 1) *
|
||||
num_glwes_for_compression *
|
||||
(compression_params.glwe_dimension + 1) *
|
||||
compression_params.polynomial_size * sizeof(Torus),
|
||||
streams[0], gpu_indexes[0]);
|
||||
auto fp_ks_buffer = mem_ptr->fp_ks_buffer;
|
||||
auto rem_lwes = num_lwes;
|
||||
auto rem_lwes = num_radix_blocks;
|
||||
|
||||
auto lwe_subset = lwe_shifted;
|
||||
auto glwe_out = tmp_glwe_array_out;
|
||||
@@ -115,13 +136,13 @@ __host__ void host_integer_compress(cudaStream_t *streams,
|
||||
// Modulus switch
|
||||
host_modulus_switch_inplace<Torus>(
|
||||
streams[0], gpu_indexes[0], tmp_glwe_array_out,
|
||||
num_glwes * (compression_params.glwe_dimension + 1) *
|
||||
num_glwes_for_compression * (compression_params.glwe_dimension + 1) *
|
||||
compression_params.polynomial_size,
|
||||
mem_ptr->storage_log_modulus);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
host_pack<Torus>(streams[0], gpu_indexes[0], glwe_array_out,
|
||||
tmp_glwe_array_out, num_glwes, mem_ptr);
|
||||
tmp_glwe_array_out, num_glwes_for_compression,
|
||||
num_radix_blocks, mem_ptr);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
@@ -160,11 +181,15 @@ __global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index,
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts the glwe_index-nth GLWE ciphertext
|
||||
template <typename Torus>
|
||||
__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
|
||||
Torus *glwe_array_out, Torus *array_in,
|
||||
uint32_t glwe_index,
|
||||
int_decompression<Torus> *mem_ptr) {
|
||||
if (array_in == glwe_array_out)
|
||||
PANIC("Cuda error: Input and output must be different");
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
auto compression_params = mem_ptr->compression_params;
|
||||
@@ -221,7 +246,10 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
"be smaller than "
|
||||
"polynomial_size.")
|
||||
|
||||
auto num_lwes = h_mem_ptr->num_lwes;
|
||||
auto num_radix_blocks = h_mem_ptr->num_radix_blocks;
|
||||
if (num_radix_blocks != indexes_array_size)
|
||||
PANIC("Cuda error: wrong number of LWEs in decompress: the number of LWEs "
|
||||
"should be the same as indexes_array_size.")
|
||||
|
||||
// the first element is the last index in h_indexes_array that lies in the
|
||||
// related GLWE
|
||||
@@ -251,23 +279,23 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
}
|
||||
}
|
||||
// Sample extract all LWEs
|
||||
Torus lwe_accumulator_size =
|
||||
(compression_params.glwe_dimension * compression_params.polynomial_size +
|
||||
1);
|
||||
Torus lwe_accumulator_size = compression_params.small_lwe_dimension + 1;
|
||||
|
||||
auto extracted_lwe = h_mem_ptr->tmp_extracted_lwe;
|
||||
uint32_t current_idx = 0;
|
||||
auto d_indexes_array_chunk = d_indexes_array;
|
||||
for (const auto &max_idx_and_glwe : glwe_vec) {
|
||||
uint32_t max_idx = max_idx_and_glwe.first;
|
||||
uint32_t last_idx = max_idx_and_glwe.first;
|
||||
extracted_glwe = max_idx_and_glwe.second;
|
||||
|
||||
cuda_glwe_sample_extract_64(
|
||||
streams[0], gpu_indexes[0], extracted_lwe, extracted_glwe,
|
||||
d_indexes_array, max_idx + 1 - current_idx,
|
||||
compression_params.glwe_dimension, compression_params.polynomial_size);
|
||||
|
||||
auto num_lwes = last_idx + 1 - current_idx;
|
||||
cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe,
|
||||
extracted_glwe, d_indexes_array_chunk, num_lwes,
|
||||
compression_params.glwe_dimension,
|
||||
compression_params.polynomial_size);
|
||||
d_indexes_array_chunk += num_lwes;
|
||||
extracted_lwe += lwe_accumulator_size;
|
||||
current_idx = max_idx;
|
||||
current_idx = last_idx;
|
||||
}
|
||||
|
||||
// Reset
|
||||
@@ -280,9 +308,8 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
/// dimension to a big LWE dimension
|
||||
auto encryption_params = h_mem_ptr->encryption_params;
|
||||
auto lut = h_mem_ptr->carry_extract_lut;
|
||||
auto active_gpu_count = get_active_gpu_count(num_lwes, gpu_count);
|
||||
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
|
||||
if (active_gpu_count == 1) {
|
||||
|
||||
execute_pbs_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
|
||||
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
|
||||
@@ -291,7 +318,8 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
compression_params.small_lwe_dimension,
|
||||
encryption_params.polynomial_size, encryption_params.pbs_base_log,
|
||||
encryption_params.pbs_level, encryption_params.grouping_factor,
|
||||
num_lwes, encryption_params.pbs_type, lut_count, lut_stride);
|
||||
num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride,
|
||||
false);
|
||||
} else {
|
||||
/// For multi GPU execution we create vectors of pointers for inputs and
|
||||
/// outputs
|
||||
@@ -306,7 +334,7 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
/// gather data to GPU 0 we can copy back to the original indexing
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
|
||||
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_lwes,
|
||||
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks,
|
||||
compression_params.small_lwe_dimension + 1);
|
||||
|
||||
/// Apply PBS
|
||||
@@ -318,14 +346,15 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
compression_params.small_lwe_dimension,
|
||||
encryption_params.polynomial_size, encryption_params.pbs_base_log,
|
||||
encryption_params.pbs_level, encryption_params.grouping_factor,
|
||||
num_lwes, encryption_params.pbs_type, lut_count, lut_stride);
|
||||
num_radix_blocks, encryption_params.pbs_type, lut_count, lut_stride,
|
||||
false);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
|
||||
d_lwe_array_out, lwe_after_pbs_vec,
|
||||
lut->h_lwe_indexes_out,
|
||||
lut->using_trivial_lwe_indexes, num_lwes,
|
||||
encryption_params.big_lwe_dimension + 1);
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
|
||||
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
|
||||
lut->using_trivial_lwe_indexes, num_radix_blocks,
|
||||
encryption_params.big_lwe_dimension + 1);
|
||||
|
||||
/// Synchronize all GPUs
|
||||
for (uint i = 0; i < active_gpu_count; i++) {
|
||||
@@ -337,24 +366,25 @@ host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_compress_integer_radix_ciphertext(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int_compression<Torus> **mem_ptr, uint32_t num_lwes,
|
||||
int_compression<Torus> **mem_ptr, uint32_t num_radix_blocks,
|
||||
int_radix_params compression_params, uint32_t lwe_per_glwe,
|
||||
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
|
||||
|
||||
*mem_ptr = new int_compression<Torus>(
|
||||
streams, gpu_indexes, gpu_count, compression_params, num_lwes,
|
||||
streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
|
||||
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void scratch_cuda_integer_decompress_radix_ciphertext(
|
||||
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
|
||||
int_decompression<Torus> **mem_ptr, uint32_t num_lwes, uint32_t body_count,
|
||||
int_radix_params encryption_params, int_radix_params compression_params,
|
||||
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
|
||||
int_decompression<Torus> **mem_ptr, uint32_t num_radix_blocks,
|
||||
uint32_t body_count, int_radix_params encryption_params,
|
||||
int_radix_params compression_params, uint32_t storage_log_modulus,
|
||||
bool allocate_gpu_memory) {
|
||||
|
||||
*mem_ptr = new int_decompression<Torus>(
|
||||
streams, gpu_indexes, gpu_count, encryption_params, compression_params,
|
||||
num_lwes, body_count, storage_log_modulus, allocate_gpu_memory);
|
||||
num_radix_blocks, body_count, storage_log_modulus, allocate_gpu_memory);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -214,7 +214,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
|
||||
grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride,
|
||||
true);
|
||||
} else {
|
||||
/// Make sure all data that should be on GPU 0 is indeed there
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
@@ -241,7 +242,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
|
||||
lut_stride);
|
||||
lut_stride, true);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
|
||||
@@ -298,7 +299,8 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
|
||||
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
|
||||
grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride,
|
||||
true);
|
||||
} else {
|
||||
/// Make sure all data that should be on GPU 0 is indeed there
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
@@ -325,7 +327,7 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
|
||||
lut_stride);
|
||||
lut_stride, true);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
|
||||
@@ -394,7 +396,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
|
||||
lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
|
||||
small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
|
||||
grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride);
|
||||
grouping_factor, num_radix_blocks, pbs_type, lut_count, lut_stride,
|
||||
true);
|
||||
} else {
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
multi_gpu_scatter_lwe_async<Torus>(
|
||||
@@ -417,7 +420,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
|
||||
lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
|
||||
pbs_level, grouping_factor, num_radix_blocks, pbs_type, lut_count,
|
||||
lut_stride);
|
||||
lut_stride, true);
|
||||
|
||||
/// Copy data back to GPU 0 and release vecs
|
||||
multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
|
||||
@@ -816,7 +819,8 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
|
||||
mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
|
||||
params.glwe_dimension, params.small_lwe_dimension,
|
||||
params.polynomial_size, params.pbs_base_log, params.pbs_level,
|
||||
params.grouping_factor, 2, params.pbs_type, lut_count, lut_stride);
|
||||
params.grouping_factor, 2, params.pbs_type, lut_count, lut_stride,
|
||||
true);
|
||||
|
||||
cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
|
||||
big_lwe_size * sizeof(Torus), streams[0],
|
||||
|
||||
@@ -368,7 +368,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size,
|
||||
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
|
||||
mem_ptr->params.grouping_factor, total_count,
|
||||
mem_ptr->params.pbs_type, lut_count, lut_stride);
|
||||
mem_ptr->params.pbs_type, lut_count, lut_stride, true);
|
||||
} else {
|
||||
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
|
||||
|
||||
@@ -416,7 +416,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
|
||||
glwe_dimension, small_lwe_dimension, polynomial_size,
|
||||
mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
|
||||
mem_ptr->params.grouping_factor, total_count,
|
||||
mem_ptr->params.pbs_type, lut_count, lut_stride);
|
||||
mem_ptr->params.pbs_type, lut_count, lut_stride, true);
|
||||
|
||||
multi_gpu_gather_lwe_async<Torus>(
|
||||
streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec,
|
||||
|
||||
@@ -128,7 +128,7 @@ void execute_pbs_async(
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t grouping_factor,
|
||||
uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type, uint32_t lut_count,
|
||||
uint32_t lut_stride) {
|
||||
uint32_t lut_stride, bool do_modulus_switch) {
|
||||
switch (sizeof(Torus)) {
|
||||
case sizeof(uint32_t):
|
||||
// 32 bits
|
||||
@@ -161,7 +161,7 @@ void execute_pbs_async(
|
||||
current_lwe_array_in, current_lwe_input_indexes,
|
||||
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_inputs_on_gpu,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
@@ -229,7 +229,7 @@ void execute_pbs_async(
|
||||
current_lwe_array_in, current_lwe_input_indexes,
|
||||
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_inputs_on_gpu,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
|
||||
@@ -45,7 +45,7 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, uint32_t lut_count,
|
||||
uint32_t lut_stride) {
|
||||
uint32_t lut_stride, bool do_modulus_switch) {
|
||||
|
||||
grid_group grid = this_grid();
|
||||
|
||||
@@ -94,8 +94,11 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
if (do_modulus_switch)
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
else
|
||||
b_hat = block_lwe_array_in[lwe_dimension];
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
@@ -107,7 +110,10 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
Torus a_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);
|
||||
if (do_modulus_switch)
|
||||
modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);
|
||||
else
|
||||
a_hat = block_lwe_array_in[i];
|
||||
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
@@ -210,7 +216,7 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t lut_count, uint32_t lut_stride) {
|
||||
uint32_t lut_count, uint32_t lut_stride, bool do_modulus_switch) {
|
||||
|
||||
// With SM each block corresponds to either the mask or body, no need to
|
||||
// duplicate data for each
|
||||
@@ -250,6 +256,7 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
kernel_args[12] = &d_mem;
|
||||
kernel_args[14] = &lut_count;
|
||||
kernel_args[15] = &lut_stride;
|
||||
kernel_args[16] = &do_modulus_switch;
|
||||
|
||||
if (max_shared_memory < partial_sm) {
|
||||
kernel_args[13] = &full_dm;
|
||||
|
||||
@@ -123,7 +123,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride) {
|
||||
uint32_t lut_stride, bool do_modulus_switch) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -132,7 +132,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap_tbc<Torus, Degree<512>>(
|
||||
@@ -140,7 +140,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
|
||||
@@ -148,7 +148,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
|
||||
@@ -156,7 +156,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
|
||||
@@ -164,7 +164,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 8192:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
|
||||
@@ -172,7 +172,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 16384:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
|
||||
@@ -180,7 +180,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
@@ -379,7 +379,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride) {
|
||||
uint32_t lut_stride, bool do_modulus_switch) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -388,7 +388,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap_cg<Torus, Degree<512>>(
|
||||
@@ -396,7 +396,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_cg<Torus, Degree<1024>>(
|
||||
@@ -404,7 +404,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
|
||||
@@ -412,7 +412,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
|
||||
@@ -420,7 +420,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 8192:
|
||||
host_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
|
||||
@@ -428,7 +428,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 16384:
|
||||
host_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
|
||||
@@ -436,7 +436,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
@@ -453,7 +453,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride) {
|
||||
uint32_t lut_stride, bool do_modulus_switch) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
@@ -462,7 +462,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap<Torus, Degree<512>>(
|
||||
@@ -470,7 +470,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap<Torus, Degree<1024>>(
|
||||
@@ -478,7 +478,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
@@ -486,7 +486,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
@@ -494,7 +494,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 8192:
|
||||
host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
@@ -502,7 +502,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case 16384:
|
||||
host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
@@ -510,7 +510,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
@@ -527,7 +527,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {
|
||||
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride,
|
||||
bool do_modulus_switch) {
|
||||
|
||||
if (base_log > 32)
|
||||
PANIC("Cuda error (classical PBS): base log should be > number of bits "
|
||||
@@ -548,7 +549,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
static_cast<uint32_t *>(lwe_input_indexes),
|
||||
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
#else
|
||||
PANIC("Cuda error (PBS): TBC pbs is not supported.")
|
||||
@@ -563,7 +564,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
static_cast<uint32_t *>(lwe_input_indexes),
|
||||
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case DEFAULT:
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
|
||||
@@ -575,7 +576,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
static_cast<uint32_t *>(lwe_input_indexes),
|
||||
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unknown pbs variant.")
|
||||
@@ -649,7 +650,8 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride) {
|
||||
uint32_t num_samples, uint32_t lut_count, uint32_t lut_stride,
|
||||
bool do_modulus_switch) {
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (classical PBS): base log should be > number of bits "
|
||||
"in the ciphertext representation (64)");
|
||||
@@ -669,7 +671,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
#else
|
||||
PANIC("Cuda error (PBS): TBC pbs is not supported.")
|
||||
@@ -684,7 +686,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
case PBS_VARIANT::DEFAULT:
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
@@ -696,7 +698,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
static_cast<uint64_t *>(lwe_input_indexes),
|
||||
static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
lut_count, lut_stride);
|
||||
lut_count, lut_stride, do_modulus_switch);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unknown pbs variant.")
|
||||
@@ -725,7 +727,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
|
||||
pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride);
|
||||
uint32_t lut_stride, bool do_modulus_switch);
|
||||
|
||||
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
|
||||
@@ -735,7 +737,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride);
|
||||
uint32_t lut_stride, bool do_modulus_switch);
|
||||
|
||||
template void scratch_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
@@ -756,7 +758,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
|
||||
pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride);
|
||||
uint32_t lut_stride, bool do_modulus_switch);
|
||||
|
||||
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
|
||||
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
|
||||
@@ -766,7 +768,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
|
||||
pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride);
|
||||
uint32_t lut_stride, bool do_modulus_switch);
|
||||
|
||||
template void scratch_cuda_programmable_bootstrap_cg<uint32_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
@@ -795,7 +797,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
|
||||
pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride);
|
||||
uint32_t lut_stride, bool do_modulus_switch);
|
||||
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
|
||||
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
|
||||
uint64_t *lwe_output_indexes, uint64_t *lut_vector,
|
||||
@@ -804,7 +806,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
|
||||
pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t lut_count,
|
||||
uint32_t lut_stride);
|
||||
uint32_t lut_stride, bool do_modulus_switch);
|
||||
template void scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
|
||||
|
||||
@@ -27,7 +27,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
Torus *global_accumulator, double2 *global_accumulator_fft,
|
||||
uint32_t lwe_iteration, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block) {
|
||||
int8_t *device_mem, uint64_t device_memory_size_per_block,
|
||||
bool do_modulus_switch) {
|
||||
|
||||
// We use shared memory for the polynomials that are used often during the
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
@@ -75,8 +76,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// First iteration
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
if (do_modulus_switch)
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
else
|
||||
b_hat = block_lwe_array_in[lwe_dimension];
|
||||
// The y-dimension is used to select the element of the GLWE this block will
|
||||
// compute
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
@@ -94,8 +98,11 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
Torus a_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
|
||||
params::log2_degree + 1); // 2 * params::log2_degree + 1);
|
||||
if (do_modulus_switch)
|
||||
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
|
||||
params::log2_degree + 1); // 2 * params::log2_degree + 1);
|
||||
else
|
||||
a_hat = block_lwe_array_in[lwe_iteration];
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
@@ -324,7 +331,7 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count, int8_t *d_mem,
|
||||
int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
|
||||
uint64_t full_sm, uint64_t full_dm) {
|
||||
uint64_t full_sm, uint64_t full_dm, bool do_modulus_switch) {
|
||||
|
||||
int max_shared_memory = cuda_get_max_shared_memory(0);
|
||||
cudaSetDevice(gpu_index);
|
||||
@@ -337,21 +344,21 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, full_dm);
|
||||
level_count, d_mem, full_dm, do_modulus_switch);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>
|
||||
<<<grid, thds, partial_sm, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, partial_dm);
|
||||
level_count, d_mem, partial_dm, do_modulus_switch);
|
||||
} else {
|
||||
device_programmable_bootstrap_step_one<Torus, params, FULLSM>
|
||||
<<<grid, thds, full_sm, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, global_accumulator, global_accumulator_fft,
|
||||
lwe_iteration, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, d_mem, 0);
|
||||
level_count, d_mem, 0, do_modulus_switch);
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -407,7 +414,7 @@ __host__ void host_programmable_bootstrap(
|
||||
pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t lut_count, uint32_t lut_stride) {
|
||||
uint32_t lut_count, uint32_t lut_stride, bool do_modulus_switch) {
|
||||
cudaSetDevice(gpu_index);
|
||||
|
||||
// With SM each block corresponds to either the mask or body, no need to
|
||||
@@ -437,7 +444,8 @@ __host__ void host_programmable_bootstrap(
|
||||
lwe_input_indexes, bootstrapping_key, global_accumulator,
|
||||
global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
|
||||
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
|
||||
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one,
|
||||
do_modulus_switch);
|
||||
execute_step_two<Torus, params>(
|
||||
stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
|
||||
lut_vector_indexes, bootstrapping_key, global_accumulator,
|
||||
|
||||
@@ -45,7 +45,7 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *device_mem,
|
||||
uint64_t device_memory_size_per_block, bool support_dsm, uint32_t lut_count,
|
||||
uint32_t lut_stride) {
|
||||
uint32_t lut_stride, bool do_modulus_switch) {
|
||||
|
||||
cluster_group cluster = this_cluster();
|
||||
|
||||
@@ -97,8 +97,11 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
if (do_modulus_switch)
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
else
|
||||
b_hat = block_lwe_array_in[lwe_dimension];
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
@@ -110,8 +113,11 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
Torus a_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[i], a_hat,
|
||||
params::log2_degree + 1); // 2 * params::log2_degree + 1);
|
||||
if (do_modulus_switch)
|
||||
modulus_switch(block_lwe_array_in[i], a_hat,
|
||||
params::log2_degree + 1); // 2 * params::log2_degree + 1);
|
||||
else
|
||||
a_hat = block_lwe_array_in[i];
|
||||
|
||||
// Perform ACC * (X^ä - 1)
|
||||
multiply_by_monomial_negacyclic_and_sub_polynomial<
|
||||
@@ -216,7 +222,7 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
uint32_t lut_count, uint32_t lut_stride) {
|
||||
uint32_t lut_count, uint32_t lut_stride, bool do_modulus_switch) {
|
||||
|
||||
auto supports_dsm =
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
@@ -272,7 +278,7 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm,
|
||||
supports_dsm, lut_count, lut_stride));
|
||||
supports_dsm, lut_count, lut_stride, do_modulus_switch));
|
||||
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
|
||||
config.dynamicSmemBytes = partial_sm + minimum_sm_tbc;
|
||||
|
||||
@@ -281,7 +287,7 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
|
||||
partial_dm, supports_dsm, lut_count, lut_stride));
|
||||
partial_dm, supports_dsm, lut_count, lut_stride, do_modulus_switch));
|
||||
} else {
|
||||
config.dynamicSmemBytes = full_sm + minimum_sm_tbc;
|
||||
|
||||
@@ -290,7 +296,7 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0,
|
||||
supports_dsm, lut_count, lut_stride));
|
||||
supports_dsm, lut_count, lut_stride, do_modulus_switch));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -96,7 +96,7 @@ extern "C" {
|
||||
lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
num_lwes: u32,
|
||||
num_radix_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: u32,
|
||||
@@ -117,7 +117,7 @@ extern "C" {
|
||||
lwe_dimension: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
num_lwes: u32,
|
||||
num_radix_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: u32,
|
||||
@@ -1170,6 +1170,7 @@ extern "C" {
|
||||
num_samples: u32,
|
||||
lut_count: u32,
|
||||
lut_stride: u32,
|
||||
do_modulus_switch: bool,
|
||||
);
|
||||
|
||||
pub fn cleanup_cuda_programmable_bootstrap(
|
||||
|
||||
@@ -141,6 +141,7 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
|
||||
num_samples,
|
||||
lut_count,
|
||||
lut_stride,
|
||||
true,
|
||||
);
|
||||
cleanup_cuda_programmable_bootstrap(
|
||||
streams.ptr[0],
|
||||
|
||||
@@ -129,19 +129,19 @@ impl CudaCompressedCiphertextList {
|
||||
/// use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
|
||||
/// use tfhe::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext};
|
||||
/// use tfhe::integer::gpu::gen_keys_radix_gpu;
|
||||
/// use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64;
|
||||
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64;
|
||||
/// use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
|
||||
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
|
||||
///
|
||||
/// let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64);
|
||||
/// let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
|
||||
///
|
||||
/// let private_compression_key =
|
||||
/// cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64);
|
||||
/// cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
|
||||
///
|
||||
/// let streams = CudaStreams::new_multi_gpu();
|
||||
///
|
||||
/// let num_blocks = 32;
|
||||
/// let (radix_cks, _) = gen_keys_radix_gpu(
|
||||
/// PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
/// PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
/// num_blocks,
|
||||
/// &streams,
|
||||
/// );
|
||||
@@ -268,19 +268,19 @@ impl CompressedCiphertextList {
|
||||
/// use tfhe::integer::gpu::ciphertext::{CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext};
|
||||
/// use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
|
||||
/// use tfhe::integer::gpu::gen_keys_radix_gpu;
|
||||
/// use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64;
|
||||
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64;
|
||||
/// use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
|
||||
/// use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
|
||||
///
|
||||
/// let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64);
|
||||
/// let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
|
||||
///
|
||||
/// let private_compression_key =
|
||||
/// cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64);
|
||||
/// cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
|
||||
///
|
||||
/// let streams = CudaStreams::new_multi_gpu();
|
||||
///
|
||||
/// let num_blocks = 32;
|
||||
/// let (radix_cks, _) = gen_keys_radix_gpu(
|
||||
/// PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
/// PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
/// num_blocks,
|
||||
/// &streams,
|
||||
/// );
|
||||
@@ -514,8 +514,8 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::integer::gpu::gen_keys_radix_gpu;
|
||||
use crate::integer::ClientKey;
|
||||
use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64;
|
||||
use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64;
|
||||
use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
|
||||
use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
|
||||
use rand::Rng;
|
||||
|
||||
const NB_TESTS: usize = 10;
|
||||
@@ -523,16 +523,16 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_gpu_ciphertext_compression() {
|
||||
let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64);
|
||||
let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
|
||||
|
||||
let private_compression_key =
|
||||
cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64);
|
||||
cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
|
||||
|
||||
let streams = CudaStreams::new_multi_gpu();
|
||||
|
||||
let num_blocks = 32;
|
||||
let (radix_cks, _) = gen_keys_radix_gpu(
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_GAUSSIAN_2M64,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64,
|
||||
num_blocks,
|
||||
&streams,
|
||||
);
|
||||
@@ -575,7 +575,7 @@ mod tests {
|
||||
.unwrap();
|
||||
let decompressed = d_decompressed.to_radix_ciphertext(&streams);
|
||||
let decrypted: u128 = radix_cks.decrypt(&decompressed);
|
||||
assert_eq!(decrypted, *message);
|
||||
assert_eq!(decrypted, *message, "nb messages {nb_messages}");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -610,7 +610,7 @@ mod tests {
|
||||
.unwrap();
|
||||
let decompressed = d_decompressed.to_signed_radix_ciphertext(&streams);
|
||||
let decrypted: i128 = radix_cks.decrypt_signed(&decompressed);
|
||||
assert_eq!(decrypted, *message);
|
||||
assert_eq!(decrypted, *message, "nb messages {nb_messages}");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -644,7 +644,7 @@ mod tests {
|
||||
.unwrap();
|
||||
let decompressed = d_decompressed.to_boolean_block(&streams);
|
||||
let decrypted = radix_cks.decrypt_bool(&decompressed);
|
||||
assert_eq!(decrypted, *message);
|
||||
assert_eq!(decrypted, *message, "nb messages {nb_messages}");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -705,7 +705,7 @@ mod tests {
|
||||
.unwrap();
|
||||
let decompressed = d_decompressed.to_radix_ciphertext(&streams);
|
||||
let decrypted: u128 = radix_cks.decrypt(&decompressed);
|
||||
assert_eq!(decrypted, *message);
|
||||
assert_eq!(decrypted, *message, "nb messages {nb_messages}");
|
||||
}
|
||||
MessageType::Signed(message) => {
|
||||
let d_decompressed: CudaSignedRadixCiphertext = cuda_compressed
|
||||
@@ -714,7 +714,7 @@ mod tests {
|
||||
.unwrap();
|
||||
let decompressed = d_decompressed.to_signed_radix_ciphertext(&streams);
|
||||
let decrypted: i128 = radix_cks.decrypt_signed(&decompressed);
|
||||
assert_eq!(decrypted, *message);
|
||||
assert_eq!(decrypted, *message, "nb messages {nb_messages}");
|
||||
}
|
||||
MessageType::Boolean(message) => {
|
||||
let d_decompressed: CudaBooleanBlock = cuda_compressed
|
||||
@@ -723,7 +723,7 @@ mod tests {
|
||||
.unwrap();
|
||||
let decompressed = d_decompressed.to_boolean_block(&streams);
|
||||
let decrypted = radix_cks.decrypt_bool(&decompressed);
|
||||
assert_eq!(decrypted, *message);
|
||||
assert_eq!(decrypted, *message, "nb messages {nb_messages}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,32 +92,27 @@ impl CudaCompressionKey {
|
||||
let lwe_ciphertext_count = LweCiphertextCount(total_num_blocks);
|
||||
|
||||
let gpu_index = streams.gpu_indexes[0];
|
||||
let d_vec = unsafe {
|
||||
let mut d_vec = CudaVec::new_async(
|
||||
lwe_dimension.to_lwe_size().0 * lwe_ciphertext_count.0,
|
||||
streams,
|
||||
gpu_index,
|
||||
let mut d_vec = CudaVec::new_async(
|
||||
lwe_dimension.to_lwe_size().0 * lwe_ciphertext_count.0,
|
||||
streams,
|
||||
gpu_index,
|
||||
);
|
||||
let mut offset: usize = 0;
|
||||
for ciphertext in vec_ciphertexts {
|
||||
let dest_ptr = d_vec
|
||||
.as_mut_c_ptr(gpu_index)
|
||||
.add(offset * std::mem::size_of::<u64>());
|
||||
let size = ciphertext.d_blocks.0.d_vec.len * std::mem::size_of::<u64>();
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
dest_ptr,
|
||||
ciphertext.d_blocks.0.d_vec.as_c_ptr(gpu_index),
|
||||
size as u64,
|
||||
streams.ptr[gpu_index as usize],
|
||||
streams.gpu_indexes[gpu_index as usize],
|
||||
);
|
||||
let mut offset: usize = 0;
|
||||
for ciphertext in vec_ciphertexts {
|
||||
let dest_ptr = d_vec
|
||||
.as_mut_c_ptr(gpu_index)
|
||||
.add(offset * std::mem::size_of::<u64>());
|
||||
let size = ciphertext.d_blocks.0.d_vec.len * std::mem::size_of::<u64>();
|
||||
cuda_memcpy_async_gpu_to_gpu(
|
||||
dest_ptr,
|
||||
ciphertext.d_blocks.0.d_vec.as_c_ptr(gpu_index),
|
||||
size as u64,
|
||||
streams.ptr[gpu_index as usize],
|
||||
streams.gpu_indexes[gpu_index as usize],
|
||||
);
|
||||
|
||||
offset += ciphertext.d_blocks.0.d_vec.len;
|
||||
}
|
||||
|
||||
streams.synchronize();
|
||||
d_vec
|
||||
};
|
||||
offset += ciphertext.d_blocks.0.d_vec.len;
|
||||
}
|
||||
|
||||
CudaLweCiphertextList::from_cuda_vec(d_vec, lwe_ciphertext_count, ciphertext_modulus)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user