refactor(gpu): cleaning compression

committed by enzodimaria
parent 46a7229c81
commit d1c417bf71
@@ -3,6 +3,26 @@
 #include "../../pbs/pbs_enums.h"
 
+typedef struct {
+  void *ptr;
+  uint32_t num_radix_blocks;
+  uint32_t lwe_dimension;
+} CudaLweCiphertextListFFI;
+
+typedef struct {
+  void *ptr;
+  uint32_t storage_log_modulus;
+  uint32_t lwe_per_glwe;
+  // Input LWEs are grouped by groups of `lwe_per_glwe` (the last group may be
+  // smaller).
+  // Each group is then packed into one GLWE with `lwe_per_glwe` bodies (one
+  // for each LWE of the group). In the end the total number of bodies is
+  // equal to the number of input LWEs.
+  uint32_t total_lwe_bodies_count;
+  uint32_t glwe_dimension;
+  uint32_t polynomial_size;
+} CudaPackedGlweCiphertextListFFI;
+
 extern "C" {
 uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
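As a quick illustration of the grouping described in the comment above (not part of the commit), a minimal C++ sketch of how the `CudaPackedGlweCiphertextListFFI` fields determine the GLWE count; the struct definition is copied from the header and the sample values are made up:

#include <cstdint>
#include <cstdio>

typedef struct {
  void *ptr;
  uint32_t storage_log_modulus;
  uint32_t lwe_per_glwe;
  uint32_t total_lwe_bodies_count;
  uint32_t glwe_dimension;
  uint32_t polynomial_size;
} CudaPackedGlweCiphertextListFFI;

int main() {
  // Illustrative values only: 300 LWE bodies packed 256 per GLWE.
  CudaPackedGlweCiphertextListFFI list = {nullptr, 11, 256, 300, 1, 256};
  // Ceiling division: the last group may be smaller than lwe_per_glwe.
  uint32_t num_glwes =
      (list.total_lwe_bodies_count + list.lwe_per_glwe - 1) / list.lwe_per_glwe;
  uint32_t last_group = list.total_lwe_bodies_count % list.lwe_per_glwe;
  printf("%u GLWEs, last holds %u bodies\n", num_glwes,
         last_group == 0 ? list.lwe_per_glwe : last_group); // 2 GLWEs, 44 bodies
  return 0;
}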
@@ -10,28 +30,29 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
     uint32_t compression_polynomial_size, uint32_t lwe_dimension,
     uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
-    bool allocate_gpu_memory);
+    uint32_t lwe_per_glwe, bool allocate_gpu_memory);
 
 uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
     int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
     uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
     uint32_t compression_polynomial_size, uint32_t lwe_dimension,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t storage_log_modulus, uint32_t body_count, bool allocate_gpu_memory,
+    uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t num_blocks_to_decompress, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
     bool allocate_ms_array);
 
 void cuda_integer_compress_radix_ciphertext_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *glwe_array_out, void const *lwe_array_in, void *const *fp_ksk,
-    uint32_t num_nths, int8_t *mem_ptr);
+    CudaPackedGlweCiphertextListFFI *glwe_array_out,
+    CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
+    int8_t *mem_ptr);
 
 void cuda_integer_decompress_radix_ciphertext_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *glwe_in, uint32_t const *indexes_array,
-    uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr);
+    CudaLweCiphertextListFFI *lwe_array_out,
+    CudaPackedGlweCiphertextListFFI const *glwe_in,
+    uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr);
 
 void cleanup_cuda_integer_compress_radix_ciphertext_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
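The net effect of this signature change: `num_nths`, `indexes_array_size`, `storage_log_modulus`, and `body_count` no longer travel as loose scalars; they ride inside the two FFI structs. A hedged caller-side sketch (values illustrative; `d_lwe_ptr` and `d_packed_ptr` stand for device allocations the caller would own):

// Hypothetical setup mirroring the structs declared in the header above.
CudaLweCiphertextListFFI lwe_list = {
    /*ptr=*/d_lwe_ptr, /*num_radix_blocks=*/4, /*lwe_dimension=*/879};
CudaPackedGlweCiphertextListFFI packed_list = {
    /*ptr=*/d_packed_ptr, /*storage_log_modulus=*/11, /*lwe_per_glwe=*/256,
    /*total_lwe_bodies_count=*/4, /*glwe_dimension=*/1,
    /*polynomial_size=*/256};
// The entry points now read every layout parameter from these structs:
// cuda_integer_compress_radix_ciphertext_64(streams, gpu_indexes, gpu_count,
//                                           &packed_list, &lwe_list, fp_ksk,
//                                           mem_ptr);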
@@ -5,11 +5,6 @@
 
 template <typename Torus> struct int_compression {
   int_radix_params compression_params;
-  uint32_t storage_log_modulus;
   uint32_t lwe_per_glwe;
-
-  uint32_t body_count;
-
   // Compression
   int8_t *fp_ks_buffer;
   Torus *tmp_lwe;
@@ -19,13 +14,9 @@ template <typename Torus> struct int_compression {
   int_compression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                   uint32_t gpu_count, int_radix_params compression_params,
                   uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
-                  uint32_t storage_log_modulus, bool allocate_gpu_memory,
-                  uint64_t &size_tracker) {
+                  bool allocate_gpu_memory, uint64_t &size_tracker) {
     gpu_memory_allocated = allocate_gpu_memory;
     this->compression_params = compression_params;
     this->lwe_per_glwe = lwe_per_glwe;
-    this->storage_log_modulus = storage_log_modulus;
-    this->body_count = num_radix_blocks;
 
     uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
                                      compression_params.polynomial_size;
@@ -58,11 +49,7 @@ template <typename Torus> struct int_compression {
 template <typename Torus> struct int_decompression {
   int_radix_params encryption_params;
   int_radix_params compression_params;
-
-  uint32_t storage_log_modulus;
-
-  uint32_t num_radix_blocks;
-  uint32_t body_count;
+  uint32_t num_blocks_to_decompress;
 
   Torus *tmp_extracted_glwe;
   Torus *tmp_extracted_lwe;
@@ -74,15 +61,12 @@ template <typename Torus> struct int_decompression {
   int_decompression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                     uint32_t gpu_count, int_radix_params encryption_params,
                     int_radix_params compression_params,
-                    uint32_t num_radix_blocks, uint32_t body_count,
-                    uint32_t storage_log_modulus, bool allocate_gpu_memory,
+                    uint32_t num_blocks_to_decompress, bool allocate_gpu_memory,
                     uint64_t &size_tracker) {
     gpu_memory_allocated = allocate_gpu_memory;
     this->encryption_params = encryption_params;
     this->compression_params = compression_params;
-    this->storage_log_modulus = storage_log_modulus;
-    this->num_radix_blocks = num_radix_blocks;
-    this->body_count = body_count;
+    this->num_blocks_to_decompress = num_blocks_to_decompress;
 
     uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
                                      compression_params.polynomial_size;
@@ -90,18 +74,18 @@ template <typename Torus> struct int_decompression {
                                  compression_params.polynomial_size +
                              1);
     decompression_rescale_lut = new int_radix_lut<Torus>(
-        streams, gpu_indexes, gpu_count, encryption_params, 1, num_radix_blocks,
-        allocate_gpu_memory, size_tracker);
+        streams, gpu_indexes, gpu_count, encryption_params, 1,
+        num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
 
     tmp_extracted_glwe = (Torus *)cuda_malloc_with_size_tracking_async(
-        num_radix_blocks * glwe_accumulator_size * sizeof(Torus), streams[0],
-        gpu_indexes[0], size_tracker, allocate_gpu_memory);
+        num_blocks_to_decompress * glwe_accumulator_size * sizeof(Torus),
+        streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
     tmp_indexes_array = (uint32_t *)cuda_malloc_with_size_tracking_async(
-        num_radix_blocks * sizeof(uint32_t), streams[0], gpu_indexes[0],
+        num_blocks_to_decompress * sizeof(uint32_t), streams[0], gpu_indexes[0],
         size_tracker, allocate_gpu_memory);
     tmp_extracted_lwe = (Torus *)cuda_malloc_with_size_tracking_async(
-        num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0],
-        gpu_indexes[0], size_tracker, allocate_gpu_memory);
+        num_blocks_to_decompress * lwe_accumulator_size * sizeof(Torus),
+        streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
 
     // Rescale is done using an identity LUT
     // Here we do not divide by message_modulus
@@ -123,7 +123,6 @@ __host__ void host_modulus_switch_inplace(cudaStream_t stream,
 
   int num_threads = 0, num_blocks = 0;
   getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
-
   modulus_switch_inplace<Torus>
       <<<num_blocks, num_threads, 0, stream>>>(array, size, log_modulus);
   check_cuda_error(cudaGetLastError());
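For context, a scalar C++ reference of what a 64-bit-to-2^log_modulus modulus switch computes on one torus value. This is a sketch assuming round-to-nearest; the tie-breaking behavior is an assumption here, not taken from the commit, and the kernel above applies the reduction elementwise over `size` values:

#include <cstdint>

// Keep the top `log_modulus` bits of x, rounding on the bit just below them;
// the result lives modulo 2^log_modulus (assumes 0 < log_modulus < 64).
uint64_t modulus_switch_scalar(uint64_t x, uint32_t log_modulus) {
  uint32_t shift = 64 - log_modulus;
  uint64_t rounded = ((x >> (shift - 1)) + 1) >> 1;
  return rounded & ((1ULL << log_modulus) - 1);
}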
@@ -6,8 +6,7 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
     uint32_t compression_polynomial_size, uint32_t lwe_dimension,
     uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
-    bool allocate_gpu_memory) {
+    uint32_t lwe_per_glwe, bool allocate_gpu_memory) {
 
   int_radix_params compression_params(
       pbs_type, compression_glwe_dimension, compression_polynomial_size,
@@ -18,17 +17,16 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
   return scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       (int_compression<uint64_t> **)mem_ptr, num_radix_blocks,
-      compression_params, lwe_per_glwe, storage_log_modulus,
-      allocate_gpu_memory);
+      compression_params, lwe_per_glwe, allocate_gpu_memory);
 }
 
 uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
     int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
     uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
     uint32_t compression_polynomial_size, uint32_t lwe_dimension,
-    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    uint32_t storage_log_modulus, uint32_t body_count, bool allocate_gpu_memory,
+    uint32_t pbs_level, uint32_t pbs_base_log,
+    uint32_t num_blocks_to_decompress, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
     bool allocate_ms_array) {
 
   // Decompression doesn't keyswitch, so big and small dimensions are the same
@@ -45,31 +43,29 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
 
   return scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      (int_decompression<uint64_t> **)mem_ptr, num_radix_blocks, body_count,
-      encryption_params, compression_params, storage_log_modulus,
-      allocate_gpu_memory);
+      (int_decompression<uint64_t> **)mem_ptr, num_blocks_to_decompress,
+      encryption_params, compression_params, allocate_gpu_memory);
 }
 
 void cuda_integer_compress_radix_ciphertext_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *glwe_array_out, void const *lwe_array_in, void *const *fp_ksk,
-    uint32_t num_nths, int8_t *mem_ptr) {
+    CudaPackedGlweCiphertextListFFI *glwe_array_out,
+    CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
+    int8_t *mem_ptr) {
 
-  host_integer_compress<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(glwe_array_out),
-      static_cast<const uint64_t *>(lwe_array_in), (uint64_t *const *)(fp_ksk),
-      num_nths, (int_compression<uint64_t> *)mem_ptr);
+  host_integer_compress<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
+                                  gpu_count, glwe_array_out, lwe_array_in,
+                                  (uint64_t *const *)(fp_ksk),
+                                  (int_compression<uint64_t> *)mem_ptr);
 }
 
 void cuda_integer_decompress_radix_ciphertext_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array_out, void const *glwe_in, uint32_t const *indexes_array,
-    uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr) {
+    CudaLweCiphertextListFFI *lwe_array_out,
+    CudaPackedGlweCiphertextListFFI const *glwe_in,
+    uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr) {
 
   host_integer_decompress<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array_out),
-      static_cast<const uint64_t *>(glwe_in), indexes_array, indexes_array_size,
-      bsks, (int_decompression<uint64_t> *)mem_ptr);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out, glwe_in,
+      indexes_array, bsks, (int_decompression<uint64_t> *)mem_ptr);
 }
 
 void cleanup_cuda_integer_compress_radix_ciphertext_64(
@@ -48,19 +48,19 @@ __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
 /// implementation
 template <typename Torus>
 __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
-                        Torus *array_out, Torus *array_in, uint32_t num_glwes,
-                        uint32_t num_lwes, int_compression<Torus> *mem_ptr) {
-  if (array_in == array_out)
+                        CudaPackedGlweCiphertextListFFI *array_out,
+                        Torus *array_in, uint32_t num_glwes,
+                        int_radix_params compression_params) {
+  if (array_in == (Torus *)array_out->ptr)
     PANIC("Cuda error: Input and output must be different");
 
   cuda_set_device(gpu_index);
-  auto compression_params = mem_ptr->compression_params;
 
-  auto log_modulus = mem_ptr->storage_log_modulus;
+  auto log_modulus = array_out->storage_log_modulus;
   // [0..num_glwes-1) GLWEs
   auto in_len = num_glwes * compression_params.glwe_dimension *
                     compression_params.polynomial_size +
-                num_lwes;
+                array_out->total_lwe_bodies_count;
 
   auto number_bits_to_pack = in_len * log_modulus;
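A self-contained restatement (not from the commit) of the length bookkeeping in host_pack: every input word contributes log_modulus bits, and the packed array is the ceiling of that bit count in 64-bit words, mirroring the Rust side's `number_bits_to_pack.div_ceil(u64::BITS)`:

#include <cstdint>

uint64_t packed_len_u64(uint64_t num_glwes, uint64_t glwe_dimension,
                        uint64_t polynomial_size,
                        uint64_t total_lwe_bodies_count,
                        uint64_t log_modulus) {
  // GLWE masks plus one body per input LWE, in torus words.
  uint64_t in_len =
      num_glwes * glwe_dimension * polynomial_size + total_lwe_bodies_count;
  uint64_t number_bits_to_pack = in_len * log_modulus;
  const uint64_t nbits = 64; // bits per storage word
  return (number_bits_to_pack + nbits - 1) / nbits; // ceiling division
}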
@@ -73,18 +73,18 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
 
   dim3 grid(num_blocks);
   dim3 threads(num_threads);
-  pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
-                                            out_len, in_len, out_len);
+  pack<Torus><<<grid, threads, 0, stream>>>(
+      (Torus *)array_out->ptr, array_in, log_modulus, out_len, in_len, out_len);
   check_cuda_error(cudaGetLastError());
 }
 
 template <typename Torus>
 __host__ void
 host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
-                      uint32_t gpu_count, Torus *glwe_array_out,
-                      Torus const *lwe_array_in, Torus *const *fp_ksk,
-                      uint32_t num_radix_blocks,
-                      int_compression<Torus> *mem_ptr) {
+                      uint32_t gpu_count,
+                      CudaPackedGlweCiphertextListFFI *glwe_array_out,
+                      CudaLweCiphertextListFFI const *lwe_array_in,
+                      Torus *const *fp_ksk, int_compression<Torus> *mem_ptr) {
 
   auto compression_params = mem_ptr->compression_params;
   auto input_lwe_dimension = compression_params.small_lwe_dimension;
@@ -93,14 +93,14 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
   auto lwe_shifted = mem_ptr->tmp_lwe;
   host_cleartext_multiplication<Torus>(
       streams[0], gpu_indexes[0], lwe_shifted, lwe_array_in,
-      (uint64_t)compression_params.message_modulus, input_lwe_dimension,
-      num_radix_blocks);
+      (uint64_t)compression_params.message_modulus);
 
   uint32_t lwe_in_size = input_lwe_dimension + 1;
   uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
                            compression_params.polynomial_size;
-  uint32_t num_glwes =
-      (num_radix_blocks + mem_ptr->lwe_per_glwe - 1) / mem_ptr->lwe_per_glwe;
+  uint32_t num_glwes = (glwe_array_out->total_lwe_bodies_count +
+                        glwe_array_out->lwe_per_glwe - 1) /
+                       glwe_array_out->lwe_per_glwe;
 
   // Keyswitch LWEs to GLWE
   auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
@@ -109,12 +109,13 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                compression_params.polynomial_size * sizeof(Torus),
            streams[0], gpu_indexes[0]);
   auto fp_ks_buffer = mem_ptr->fp_ks_buffer;
-  auto rem_lwes = num_radix_blocks;
+  auto rem_lwes = glwe_array_out->total_lwe_bodies_count;
 
   auto lwe_subset = lwe_shifted;
   auto glwe_out = tmp_glwe_array_out;
 
   while (rem_lwes > 0) {
-    auto chunk_size = min(rem_lwes, mem_ptr->lwe_per_glwe);
+    auto chunk_size = min(rem_lwes, glwe_array_out->lwe_per_glwe);
 
     host_packing_keyswitch_lwe_list_to_glwe<Torus>(
         streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
@@ -128,15 +129,16 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
   }
 
   // Modulus switch
-  host_modulus_switch_inplace<Torus>(
-      streams[0], gpu_indexes[0], tmp_glwe_array_out,
-      num_glwes * compression_params.glwe_dimension *
-              compression_params.polynomial_size +
-          num_radix_blocks,
-      mem_ptr->storage_log_modulus);
+  int size = num_glwes * compression_params.glwe_dimension *
+                 compression_params.polynomial_size +
+             glwe_array_out->total_lwe_bodies_count;
+
+  host_modulus_switch_inplace<Torus>(streams[0], gpu_indexes[0],
+                                     tmp_glwe_array_out, size,
+                                     glwe_array_out->storage_log_modulus);
 
   host_pack<Torus>(streams[0], gpu_indexes[0], glwe_array_out,
-                   tmp_glwe_array_out, num_glwes, num_radix_blocks, mem_ptr);
+                   tmp_glwe_array_out, num_glwes, compression_params);
 }
 
 template <typename Torus>
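The keyswitch loop above consumes the shifted LWEs in `lwe_per_glwe`-sized chunks. A host-side sketch of just the chunking arithmetic (no CUDA calls; values illustrative):

#include <cstdint>
#include <cstdio>

int main() {
  // Illustrative: 300 bodies, 256 LWEs per GLWE, as in the loop above.
  uint32_t rem_lwes = 300, lwe_per_glwe = 256, glwe_index = 0;
  while (rem_lwes > 0) {
    uint32_t chunk_size = rem_lwes < lwe_per_glwe ? rem_lwes : lwe_per_glwe;
    printf("GLWE %u packs %u LWE bodies\n", glwe_index++, chunk_size);
    rem_lwes -= chunk_size; // the real code also advances lwe/glwe pointers
  }
  return 0; // prints: GLWE 0 packs 256, GLWE 1 packs 44
}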
@@ -176,34 +178,38 @@ __global__ void extract(Torus *glwe_array_out, Torus const *array_in,
 /// This function follows the naming used in the CPU implementation
 template <typename Torus>
 __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
-                           Torus *glwe_array_out, Torus const *array_in,
-                           uint32_t glwe_index,
-                           int_decompression<Torus> *mem_ptr) {
-  if (array_in == glwe_array_out)
+                           Torus *glwe_array_out,
+                           CudaPackedGlweCiphertextListFFI const *array_in,
+                           uint32_t glwe_index) {
+  if ((Torus *)array_in->ptr == glwe_array_out)
     PANIC("Cuda error: Input and output must be different");
 
   cuda_set_device(gpu_index);
 
-  auto compression_params = mem_ptr->compression_params;
-  auto log_modulus = mem_ptr->storage_log_modulus;
-  auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
-                              compression_params.polynomial_size;
+  auto log_modulus = array_in->storage_log_modulus;
+  auto total_lwe_bodies_count = array_in->total_lwe_bodies_count;
+  auto polynomial_size = array_in->polynomial_size;
+  auto glwe_dimension = array_in->glwe_dimension;
 
-  uint32_t body_count = mem_ptr->body_count;
-  auto num_glwes = (body_count + compression_params.polynomial_size - 1) /
-                   compression_params.polynomial_size;
+  auto glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;
+
+  uint32_t num_glwes =
+      (total_lwe_bodies_count + polynomial_size - 1) / polynomial_size;
 
   // Compressed length of the compressed GLWE we want to extract
-  if (mem_ptr->body_count % compression_params.polynomial_size == 0)
-    body_count = compression_params.polynomial_size;
-  else if (glwe_index == num_glwes - 1)
-    body_count = mem_ptr->body_count % compression_params.polynomial_size;
-  else
-    body_count = compression_params.polynomial_size;
+  uint32_t body_count = 0;
+  if (glwe_index == num_glwes - 1) {
+    auto remainder = total_lwe_bodies_count % polynomial_size;
+    if (remainder == 0) {
+      body_count = polynomial_size;
+    } else {
+      body_count = remainder;
+    }
+  } else {
+    body_count = polynomial_size;
+  }
 
-  auto initial_out_len =
-      compression_params.glwe_dimension * compression_params.polynomial_size +
-      body_count;
+  uint32_t initial_out_len = glwe_dimension * polynomial_size + body_count;
 
   // Calculates how many bits this particular GLWE shall use
   auto number_bits_to_unpack = initial_out_len * log_modulus;
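A self-contained restatement of the new body-count logic above, for reference: every GLWE in the packed list is full except possibly the last one.

#include <cstdint>

uint32_t glwe_body_count(uint32_t glwe_index, uint32_t total_lwe_bodies_count,
                         uint32_t polynomial_size) {
  uint32_t num_glwes =
      (total_lwe_bodies_count + polynomial_size - 1) / polynomial_size;
  if (glwe_index == num_glwes - 1) {
    // Only the last GLWE may be partially filled.
    uint32_t remainder = total_lwe_bodies_count % polynomial_size;
    return remainder == 0 ? polynomial_size : remainder;
  }
  return polynomial_size;
}
// e.g. 300 bodies with polynomial_size 256: GLWE 0 -> 256, GLWE 1 -> 44.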
@@ -213,7 +219,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
     number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
   auto len = (number_bits_to_unpack + nbits - 1) / nbits;
   // Uses that length to set the input pointer
-  auto chunk_array_in = array_in + glwe_index * len;
+  auto chunk_array_in = (Torus *)array_in->ptr + glwe_index * len;
 
   // Ensure the tail of the GLWE is zeroed
   if (initial_out_len < glwe_ciphertext_size) {
@@ -232,25 +238,24 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
 }
 
 template <typename Torus>
-__host__ void host_integer_decompress(
-    cudaStream_t const *streams, uint32_t const *gpu_indexes,
-    uint32_t gpu_count, Torus *d_lwe_array_out, Torus const *d_packed_glwe_in,
-    uint32_t const *h_indexes_array, uint32_t indexes_array_size,
-    void *const *d_bsks, int_decompression<Torus> *h_mem_ptr) {
+__host__ void
+host_integer_decompress(cudaStream_t const *streams,
+                        uint32_t const *gpu_indexes, uint32_t gpu_count,
+                        CudaLweCiphertextListFFI *d_lwe_array_out,
+                        CudaPackedGlweCiphertextListFFI const *d_packed_glwe_in,
+                        uint32_t const *h_indexes_array, void *const *d_bsks,
+                        int_decompression<Torus> *h_mem_ptr) {
 
+  auto num_blocks_to_decompress = h_mem_ptr->num_blocks_to_decompress;
+
   auto d_indexes_array = h_mem_ptr->tmp_indexes_array;
   cuda_memcpy_async_to_gpu(d_indexes_array, (void *)h_indexes_array,
-                           indexes_array_size * sizeof(uint32_t), streams[0],
-                           gpu_indexes[0]);
+                           num_blocks_to_decompress * sizeof(uint32_t),
+                           streams[0], gpu_indexes[0]);
 
   auto compression_params = h_mem_ptr->compression_params;
   auto lwe_per_glwe = compression_params.polynomial_size;
 
-  auto num_radix_blocks = h_mem_ptr->num_radix_blocks;
-  if (num_radix_blocks != indexes_array_size)
-    PANIC("Cuda error: wrong number of LWEs in decompress: the number of LWEs "
-          "should be the same as indexes_array_size.")
-
   // the first element is the number of LWEs that lies in the related GLWE
   std::vector<std::pair<int, Torus *>> glwe_vec;
@@ -261,16 +266,16 @@ __host__ void host_integer_decompress(
   auto current_glwe_index = h_indexes_array[0] / lwe_per_glwe;
   auto extracted_glwe = h_mem_ptr->tmp_extracted_glwe;
   host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
-                      d_packed_glwe_in, current_glwe_index, h_mem_ptr);
+                      d_packed_glwe_in, current_glwe_index);
   glwe_vec.push_back(std::make_pair(1, extracted_glwe));
-  for (int i = 1; i < indexes_array_size; i++) {
+  for (int i = 1; i < num_blocks_to_decompress; i++) {
     auto glwe_index = h_indexes_array[i] / lwe_per_glwe;
     if (glwe_index != current_glwe_index) {
       extracted_glwe += glwe_accumulator_size;
       current_glwe_index = glwe_index;
       // Extracts a new GLWE
       host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
-                          d_packed_glwe_in, glwe_index, h_mem_ptr);
+                          d_packed_glwe_in, glwe_index);
       glwe_vec.push_back(std::make_pair(1, extracted_glwe));
     } else {
       // Updates the quantity
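A restatement of the grouping loop above, with the extracted-GLWE pointer replaced by the GLWE index for readability (illustrative values; the real code stores `(count, Torus *)` pairs and calls host_extract on each new GLWE):

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Consecutive requested indexes that land in the same GLWE share one
  // extraction; only the per-GLWE quantity is bumped.
  uint32_t lwe_per_glwe = 256;
  std::vector<uint32_t> h_indexes_array = {0, 1, 2, 255, 256, 257, 512};
  std::vector<std::pair<int, uint32_t>> glwe_vec; // (count, glwe_index)
  for (uint32_t idx : h_indexes_array) {
    uint32_t glwe_index = idx / lwe_per_glwe;
    if (glwe_vec.empty() || glwe_vec.back().second != glwe_index)
      glwe_vec.push_back({1, glwe_index}); // would trigger host_extract
    else
      glwe_vec.back().first++; // same GLWE: just update the quantity
  }
  for (auto &p : glwe_vec)
    printf("GLWE %u serves %d LWEs\n", p.second, p.first);
  return 0;
}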
@@ -306,17 +311,19 @@ __host__ void host_integer_decompress(
   /// dimension to a big LWE dimension
   auto encryption_params = h_mem_ptr->encryption_params;
   auto lut = h_mem_ptr->decompression_rescale_lut;
-  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+  auto active_gpu_count =
+      get_active_gpu_count(num_blocks_to_decompress, gpu_count);
   if (active_gpu_count == 1) {
     execute_pbs_async<Torus>(
-        streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
+        streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
         lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
         lut->lwe_indexes_in, d_bsks, nullptr, lut->buffer,
         encryption_params.glwe_dimension,
         compression_params.small_lwe_dimension,
         encryption_params.polynomial_size, encryption_params.pbs_base_log,
         encryption_params.pbs_level, encryption_params.grouping_factor,
-        num_radix_blocks, encryption_params.pbs_type, num_many_lut, lut_stride);
+        num_blocks_to_decompress, encryption_params.pbs_type, num_many_lut,
+        lut_stride);
   } else {
     /// For multi GPU execution we create vectors of pointers for inputs and
     /// outputs
@@ -332,7 +339,7 @@ __host__ void host_integer_decompress(
     multi_gpu_scatter_lwe_async<Torus>(
         streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
         lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
-        lut->active_gpu_count, num_radix_blocks,
+        lut->active_gpu_count, num_blocks_to_decompress,
        compression_params.small_lwe_dimension + 1);
 
     /// Apply PBS
@@ -344,13 +351,14 @@ __host__ void host_integer_decompress(
         compression_params.small_lwe_dimension,
         encryption_params.polynomial_size, encryption_params.pbs_base_log,
         encryption_params.pbs_level, encryption_params.grouping_factor,
-        num_radix_blocks, encryption_params.pbs_type, num_many_lut, lut_stride);
+        num_blocks_to_decompress, encryption_params.pbs_type, num_many_lut,
+        lut_stride);
 
     /// Copy data back to GPU 0 and release vecs
     multi_gpu_gather_lwe_async<Torus>(
-        streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
+        streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
         lwe_after_pbs_vec, lut->h_lwe_indexes_out,
-        lut->using_trivial_lwe_indexes, num_radix_blocks,
+        lut->using_trivial_lwe_indexes, num_blocks_to_decompress,
         encryption_params.big_lwe_dimension + 1);
 
     /// Synchronize all GPUs
@@ -365,13 +373,12 @@ __host__ uint64_t scratch_cuda_compress_integer_radix_ciphertext(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, int_compression<Torus> **mem_ptr,
     uint32_t num_radix_blocks, int_radix_params compression_params,
-    uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
-    bool allocate_gpu_memory) {
+    uint32_t lwe_per_glwe, bool allocate_gpu_memory) {
 
   uint64_t size_tracker = 0;
   *mem_ptr = new int_compression<Torus>(
       streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
-      lwe_per_glwe, storage_log_modulus, allocate_gpu_memory, size_tracker);
+      lwe_per_glwe, allocate_gpu_memory, size_tracker);
   return size_tracker;
 }
 
@@ -379,15 +386,13 @@ template <typename Torus>
 __host__ uint64_t scratch_cuda_integer_decompress_radix_ciphertext(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, int_decompression<Torus> **mem_ptr,
-    uint32_t num_radix_blocks, uint32_t body_count,
-    int_radix_params encryption_params, int_radix_params compression_params,
-    uint32_t storage_log_modulus, bool allocate_gpu_memory) {
+    uint32_t num_blocks_to_decompress, int_radix_params encryption_params,
+    int_radix_params compression_params, bool allocate_gpu_memory) {
 
   uint64_t size_tracker = 0;
   *mem_ptr = new int_decompression<Torus>(
       streams, gpu_indexes, gpu_count, encryption_params, compression_params,
-      num_radix_blocks, body_count, storage_log_modulus, allocate_gpu_memory,
-      size_tracker);
+      num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
   return size_tracker;
 }
 
 #endif
@@ -13,6 +13,8 @@
 #include <iostream>
 #include <vector>
 
+#include "integer/compression/compression.h"
+
 template <typename T>
 __global__ void cleartext_vec_multiplication(T *output, T const *lwe_input,
                                              T const *cleartext_input,
@@ -51,9 +53,9 @@ __host__ void host_cleartext_vec_multiplication(
 }
 
 template <typename T>
-__global__ void
-cleartext_multiplication(T *output, T const *lwe_input, T cleartext_input,
-                         uint32_t input_lwe_dimension, uint32_t num_entries) {
+__global__ void cleartext_multiplication(T *output, T const *lwe_input,
+                                         T cleartext_input,
+                                         const uint32_t num_entries) {
 
   int tid = threadIdx.x;
   int index = blockIdx.x * blockDim.x + tid;
@@ -64,25 +66,21 @@ cleartext_multiplication(T *output, T const *lwe_input, T cleartext_input,
 }
 
 template <typename T>
-__host__ void
-host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
-                              T *output, T const *lwe_input, T cleartext_input,
-                              uint32_t input_lwe_dimension,
-                              uint32_t input_lwe_ciphertext_count) {
+__host__ void host_cleartext_multiplication(
+    cudaStream_t stream, uint32_t gpu_index, T *output,
+    CudaLweCiphertextListFFI const *lwe_input, T cleartext_input) {
 
   cuda_set_device(gpu_index);
-  // lwe_size includes the presence of the body
-  // whereas lwe_dimension is the number of elements in the mask
-  int lwe_size = input_lwe_dimension + 1;
   // Create a 1-dimensional grid of threads
   int num_blocks = 0, num_threads = 0;
-  int num_entries = input_lwe_ciphertext_count * lwe_size;
+  uint32_t num_entries =
+      lwe_input->num_radix_blocks * (lwe_input->lwe_dimension + 1);
  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
   dim3 grid(num_blocks, 1, 1);
   dim3 thds(num_threads, 1, 1);
 
   cleartext_multiplication<T><<<grid, thds, 0, stream>>>(
-      output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
+      output, (T *)lwe_input->ptr, cleartext_input, num_entries);
   check_cuda_error(cudaGetLastError());
 }
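For reference, a scalar C++ sketch of what the kernel above computes: every torus word of every LWE, mask and body alike, is multiplied by the cleartext with wrapping arithmetic, where num_entries = num_radix_blocks * (lwe_dimension + 1). The helper name is hypothetical:

#include <cstdint>
#include <vector>

void cleartext_multiplication_ref(std::vector<uint64_t> &output,
                                  const std::vector<uint64_t> &lwe_input,
                                  uint64_t cleartext, uint32_t num_entries) {
  for (uint32_t i = 0; i < num_entries; ++i)
    output[i] = lwe_input[i] * cleartext; // wraps mod 2^64, as on the torus
}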
@@ -105,6 +105,55 @@ const _: () = {
         ms_input_variance
     ) - 32usize];
 };
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct CudaLweCiphertextListFFI {
+    pub ptr: *mut ffi::c_void,
+    pub num_radix_blocks: u32,
+    pub lwe_dimension: u32,
+}
+#[allow(clippy::unnecessary_operation, clippy::identity_op)]
+const _: () = {
+    ["Size of CudaLweCiphertextListFFI"]
+        [::std::mem::size_of::<CudaLweCiphertextListFFI>() - 16usize];
+    ["Alignment of CudaLweCiphertextListFFI"]
+        [::std::mem::align_of::<CudaLweCiphertextListFFI>() - 8usize];
+    ["Offset of field: CudaLweCiphertextListFFI::ptr"]
+        [::std::mem::offset_of!(CudaLweCiphertextListFFI, ptr) - 0usize];
+    ["Offset of field: CudaLweCiphertextListFFI::num_radix_blocks"]
+        [::std::mem::offset_of!(CudaLweCiphertextListFFI, num_radix_blocks) - 8usize];
+    ["Offset of field: CudaLweCiphertextListFFI::lwe_dimension"]
+        [::std::mem::offset_of!(CudaLweCiphertextListFFI, lwe_dimension) - 12usize];
+};
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct CudaPackedGlweCiphertextListFFI {
+    pub ptr: *mut ffi::c_void,
+    pub storage_log_modulus: u32,
+    pub lwe_per_glwe: u32,
+    pub total_lwe_bodies_count: u32,
+    pub glwe_dimension: u32,
+    pub polynomial_size: u32,
+}
+#[allow(clippy::unnecessary_operation, clippy::identity_op)]
+const _: () = {
+    ["Size of CudaPackedGlweCiphertextListFFI"]
+        [::std::mem::size_of::<CudaPackedGlweCiphertextListFFI>() - 32usize];
+    ["Alignment of CudaPackedGlweCiphertextListFFI"]
+        [::std::mem::align_of::<CudaPackedGlweCiphertextListFFI>() - 8usize];
+    ["Offset of field: CudaPackedGlweCiphertextListFFI::ptr"]
+        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, ptr) - 0usize];
+    ["Offset of field: CudaPackedGlweCiphertextListFFI::storage_log_modulus"]
+        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, storage_log_modulus) - 8usize];
+    ["Offset of field: CudaPackedGlweCiphertextListFFI::lwe_per_glwe"]
+        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, lwe_per_glwe) - 12usize];
+    ["Offset of field: CudaPackedGlweCiphertextListFFI::total_lwe_bodies_count"]
+        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, total_lwe_bodies_count) - 16usize];
+    ["Offset of field: CudaPackedGlweCiphertextListFFI::glwe_dimension"]
+        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, glwe_dimension) - 20usize];
+    ["Offset of field: CudaPackedGlweCiphertextListFFI::polynomial_size"]
+        [::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
+};
 unsafe extern "C" {
     pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
         streams: *const *mut ffi::c_void,
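On the C++ side the same layout can be pinned with static_asserts; a hedged mirror of the bindgen checks above (struct definitions as in the CUDA header; x86-64-style alignment assumed, with four bytes of tail padding in the packed struct):

#include <cstddef>
#include <cstdint>

typedef struct {
  void *ptr;
  uint32_t num_radix_blocks;
  uint32_t lwe_dimension;
} CudaLweCiphertextListFFI;

static_assert(sizeof(CudaLweCiphertextListFFI) == 16, "size");
static_assert(alignof(CudaLweCiphertextListFFI) == 8, "alignment");
static_assert(offsetof(CudaLweCiphertextListFFI, num_radix_blocks) == 8, "");
static_assert(offsetof(CudaLweCiphertextListFFI, lwe_dimension) == 12, "");

typedef struct {
  void *ptr;
  uint32_t storage_log_modulus;
  uint32_t lwe_per_glwe;
  uint32_t total_lwe_bodies_count;
  uint32_t glwe_dimension;
  uint32_t polynomial_size;
} CudaPackedGlweCiphertextListFFI;

static_assert(sizeof(CudaPackedGlweCiphertextListFFI) == 32, "size");
static_assert(offsetof(CudaPackedGlweCiphertextListFFI, polynomial_size) == 24,
              "");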
@@ -121,7 +170,6 @@ unsafe extern "C" {
         carry_modulus: u32,
         pbs_type: PBS_TYPE,
         lwe_per_glwe: u32,
-        storage_log_modulus: u32,
         allocate_gpu_memory: bool,
     ) -> u64;
 }
@@ -138,12 +186,10 @@ unsafe extern "C" {
         lwe_dimension: u32,
         pbs_level: u32,
         pbs_base_log: u32,
-        num_radix_blocks: u32,
+        num_blocks_to_decompress: u32,
         message_modulus: u32,
         carry_modulus: u32,
         pbs_type: PBS_TYPE,
-        storage_log_modulus: u32,
-        body_count: u32,
         allocate_gpu_memory: bool,
         allocate_ms_array: bool,
     ) -> u64;
@@ -153,10 +199,9 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        glwe_array_out: *mut ffi::c_void,
-        lwe_array_in: *const ffi::c_void,
+        glwe_array_out: *mut CudaPackedGlweCiphertextListFFI,
+        lwe_array_in: *const CudaLweCiphertextListFFI,
         fp_ksk: *const *mut ffi::c_void,
-        num_nths: u32,
         mem_ptr: *mut i8,
     );
 }
@@ -165,10 +210,9 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        lwe_array_out: *mut ffi::c_void,
-        glwe_in: *const ffi::c_void,
+        lwe_array_out: *mut CudaLweCiphertextListFFI,
+        glwe_in: *const CudaPackedGlweCiphertextListFFI,
         indexes_array: *const u32,
-        indexes_array_size: u32,
         bsks: *const *mut ffi::c_void,
         mem_ptr: *mut i8,
     );
@@ -261,7 +261,7 @@ impl CudaCompressedCiphertextList {
             v
         };
 
-        let mut num_bodies_left = gpu_meta.bodies_count;
+        let mut num_bodies_left = gpu_meta.total_lwe_bodies_count;
         let mut chunk_start = 0;
         while num_bodies_left != 0 {
             let bodies_count = LweCiphertextCount(num_bodies_left.min(lwe_per_glwe.0));
@@ -427,7 +427,7 @@ impl CompressedCiphertextList {
                     ciphertext_modulus: cpu_meta.ciphertext_modulus,
                     storage_log_modulus: first_ct.packed_integers().log_modulus(),
                     lwe_per_glwe,
-                    bodies_count: self.packed_list.len(),
+                    total_lwe_bodies_count: self.packed_list.len(),
                     initial_len,
                 })
             });
@@ -49,7 +49,7 @@ pub struct CudaPackedGlweCiphertextListMeta {
     pub storage_log_modulus: CiphertextModulusLog,
     pub lwe_per_glwe: LweCiphertextCount,
     // Number of lwe bodies that are compressed in this list
-    pub bodies_count: usize,
+    pub total_lwe_bodies_count: usize,
     // Number of elements (u64) the uncompressed GLWE list had
     // keep in mind the last GLWE may not be full
     pub initial_len: usize,
@@ -69,7 +69,9 @@ impl CudaPackedGlweCiphertextList {
 
     pub fn bodies_count(&self) -> usize {
         // If there is no metadata, the list is empty
-        self.meta.map(|meta| meta.bodies_count).unwrap_or_default()
+        self.meta
+            .map(|meta| meta.total_lwe_bodies_count)
+            .unwrap_or_default()
     }
 
     pub fn glwe_ciphertext_count(&self) -> GlweCiphertextCount {
@@ -180,7 +182,7 @@ impl CudaCompressionKey {
         let uncompressed_len = num_glwes * glwe_mask_size + num_lwes;
         let number_bits_to_pack = uncompressed_len * self.storage_log_modulus.0;
         let compressed_len = number_bits_to_pack.div_ceil(u64::BITS as usize);
-        let mut packed_glwe_list = CudaVec::new(compressed_len, streams, 0);
+        let packed_glwe_list = CudaVec::new(compressed_len, streams, 0);
 
         if ciphertexts.is_empty() {
             return CudaPackedGlweCiphertextList {
@@ -194,16 +196,30 @@ impl CudaCompressionKey {
         let first_ct_info = first_ct.info.blocks.first().unwrap();
         let message_modulus = first_ct_info.message_modulus;
         let carry_modulus = first_ct_info.carry_modulus;
 
         let lwe_dimension = first_ct.d_blocks.lwe_dimension();
 
+        let mut glwe_array_out = CudaPackedGlweCiphertextList {
+            data: packed_glwe_list,
+            meta: Some(CudaPackedGlweCiphertextListMeta {
+                glwe_dimension: compressed_glwe_size.to_glwe_dimension(),
+                polynomial_size: compressed_polynomial_size,
+                message_modulus,
+                carry_modulus,
+                ciphertext_modulus,
+                storage_log_modulus: self.storage_log_modulus,
+                lwe_per_glwe: LweCiphertextCount(compressed_polynomial_size.0),
+                total_lwe_bodies_count: num_lwes,
+                initial_len: uncompressed_len,
+            }),
+        };
+
         unsafe {
             let input_lwes = Self::flatten_async(ciphertexts, streams);
 
             compress_integer_radix_async(
                 streams,
-                &mut packed_glwe_list,
-                &input_lwes.0.d_vec,
+                &mut glwe_array_out,
+                &input_lwes,
                 &self.packing_key_switching_key.d_vec,
                 message_modulus,
                 carry_modulus,
@@ -213,29 +229,13 @@ impl CudaCompressionKey {
                 lwe_pksk.decomposition_base_log(),
                 lwe_pksk.decomposition_level_count(),
                 self.lwe_per_glwe.0 as u32,
-                self.storage_log_modulus.0 as u32,
                 num_lwes as u32,
             );
 
             streams.synchronize();
         };
 
-        let meta = Some(CudaPackedGlweCiphertextListMeta {
-            glwe_dimension: compressed_glwe_size.to_glwe_dimension(),
-            polynomial_size: compressed_polynomial_size,
-            message_modulus,
-            carry_modulus,
-            ciphertext_modulus,
-            storage_log_modulus: self.storage_log_modulus,
-            lwe_per_glwe: LweCiphertextCount(compressed_polynomial_size.0),
-            bodies_count: num_lwes,
-            initial_len: uncompressed_len,
-        });
-
-        CudaPackedGlweCiphertextList {
-            data: packed_glwe_list,
-            meta,
-        }
+        glwe_array_out
     }
     pub fn get_compression_size_on_gpu(
         &self,
@@ -259,7 +259,6 @@ impl CudaCompressionKey {
             lwe_pksk.decomposition_base_log(),
             lwe_pksk.decomposition_level_count(),
             self.lwe_per_glwe.0 as u32,
-            self.storage_log_modulus.0 as u32,
             num_lwes,
         )
     }
@@ -308,7 +307,6 @@ impl CudaDecompressionKey {
         let message_modulus = self.message_modulus;
         let carry_modulus = self.carry_modulus;
         let ciphertext_modulus = self.ciphertext_modulus;
-        let storage_log_modulus = meta.storage_log_modulus;
 
         match &self.blind_rotate_key {
             CudaBootstrappingKey::Classic(bsk) => {
@@ -328,10 +326,9 @@ impl CudaDecompressionKey {
                 unsafe {
                     decompress_integer_radix_async(
                         streams,
-                        &mut output_lwe.0.d_vec,
-                        &packed_list.data,
+                        &mut output_lwe,
+                        packed_list,
                         &bsk.d_vec,
-                        meta.bodies_count as u32,
                         message_modulus,
                         carry_modulus,
                         encryption_glwe_dimension,
@@ -341,7 +338,6 @@ impl CudaDecompressionKey {
                         lwe_dimension,
                         bsk.decomp_base_log(),
                         bsk.decomp_level_count(),
-                        storage_log_modulus.0 as u32,
                         indexes_array.as_slice(),
                         indexes_array_len.0 as u32,
                     );
@@ -403,7 +399,6 @@ impl CudaDecompressionKey {
 
         let message_modulus = self.message_modulus;
         let carry_modulus = self.carry_modulus;
-        let storage_log_modulus = meta.storage_log_modulus;
 
         match &self.blind_rotate_key {
             CudaBootstrappingKey::Classic(bsk) => {
@@ -415,7 +410,6 @@ impl CudaDecompressionKey {
 
                 get_decompression_size_on_gpu(
                     streams,
-                    packed_list.bodies_count() as u32,
                     message_modulus,
                     carry_modulus,
                     encryption_glwe_dimension,
@@ -425,7 +419,6 @@ impl CudaDecompressionKey {
                     lwe_dimension,
                     bsk.decomp_base_log(),
                     bsk.decomp_level_count(),
-                    storage_log_modulus.0 as u32,
                     indexes_array_len.0 as u32,
                 )
             }
@@ -21,6 +21,7 @@ use crate::core_crypto::prelude::{
 use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
 use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
 use crate::integer::gpu::ciphertext::{CudaRadixCiphertext, KsType};
+use crate::integer::gpu::list_compression::server_keys::CudaPackedGlweCiphertextList;
 use crate::integer::server_key::radix_parallel::scalar_div_mod::{
     choose_multiplier, SignedReciprocable,
 };
@@ -100,6 +101,29 @@ pub fn prepare_default_scalar_divisor() -> CudaScalarDivisorFFI {
     }
 }
 
+fn prepare_cuda_lwe_ct_ffi<T: UnsignedInteger>(
+    input: &CudaLweCiphertextList<T>,
+) -> CudaLweCiphertextListFFI {
+    CudaLweCiphertextListFFI {
+        ptr: input.0.d_vec.get_mut_c_ptr(0),
+        num_radix_blocks: input.0.lwe_ciphertext_count.0 as u32,
+        lwe_dimension: input.0.lwe_dimension.0 as u32,
+    }
+}
+
+fn prepare_cuda_packed_glwe_ct_ffi(
+    input: &CudaPackedGlweCiphertextList,
+) -> CudaPackedGlweCiphertextListFFI {
+    CudaPackedGlweCiphertextListFFI {
+        ptr: input.data.get_mut_c_ptr(0),
+        storage_log_modulus: input.meta.unwrap().storage_log_modulus.0 as u32,
+        lwe_per_glwe: input.meta.unwrap().lwe_per_glwe.0 as u32,
+        total_lwe_bodies_count: input.meta.unwrap().total_lwe_bodies_count as u32,
+        glwe_dimension: input.meta.unwrap().glwe_dimension.0 as u32,
+        polynomial_size: input.meta.unwrap().polynomial_size.0 as u32,
+    }
+}
+
 // If we build the Vec<u64> inside prepare_cuda_radix_ffi
 // the data gets dropped before the call to the Cuda function,
 // and we get memory errors, hence why the reconstruction of
@@ -711,8 +735,8 @@ where
 /// is required
 pub unsafe fn compress_integer_radix_async<T: UnsignedInteger>(
     streams: &CudaStreams,
-    glwe_array_out: &mut CudaVec<T>,
-    lwe_array_in: &CudaVec<T>,
+    glwe_array_out: &mut CudaPackedGlweCiphertextList,
+    lwe_array_in: &CudaLweCiphertextList<T>,
     fp_keyswitch_key: &CudaVec<u64>,
     message_modulus: MessageModulus,
     carry_modulus: CarryModulus,
@@ -722,23 +746,8 @@ pub unsafe fn compress_integer_radix_async<T: UnsignedInteger>(
     ks_base_log: DecompositionBaseLog,
     ks_level: DecompositionLevelCount,
     lwe_per_glwe: u32,
-    storage_log_modulus: u32,
     num_blocks: u32,
 ) {
-    assert_eq!(
-        streams.gpu_indexes[0],
-        glwe_array_out.gpu_index(0),
-        "GPU error: first stream is on GPU {}, first glwe output pointer is on GPU {}",
-        streams.gpu_indexes[0].get(),
-        glwe_array_out.gpu_index(0).get(),
-    );
-    assert_eq!(
-        streams.gpu_indexes[0],
-        lwe_array_in.gpu_index(0),
-        "GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
-        streams.gpu_indexes[0].get(),
-        lwe_array_in.gpu_index(0).get(),
-    );
     assert_eq!(
         streams.gpu_indexes[0],
         fp_keyswitch_key.gpu_index(0),
@@ -746,7 +755,25 @@ pub unsafe fn compress_integer_radix_async<T: UnsignedInteger>(
         streams.gpu_indexes[0].get(),
         fp_keyswitch_key.gpu_index(0).get(),
     );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        lwe_array_in.0.d_vec.gpu_index(0),
+        "GPU error: first stream is on GPU {}, first output pointer is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        lwe_array_in.0.d_vec.gpu_index(0).get(),
+    );
+    assert_eq!(
+        streams.gpu_indexes[0],
+        glwe_array_out.data.gpu_index(0),
+        "GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
+        streams.gpu_indexes[0].get(),
+        glwe_array_out.data.gpu_index(0).get(),
+    );
     let mut mem_ptr: *mut i8 = std::ptr::null_mut();
 
+    let array_in_ffi = prepare_cuda_lwe_ct_ffi(lwe_array_in);
+    let mut glwe_array_out_ffi = prepare_cuda_packed_glwe_ct_ffi(glwe_array_out);
+
     scratch_cuda_integer_compress_radix_ciphertext_64(
         streams.ptr.as_ptr(),
         streams.gpu_indexes_ptr(),
@@ -762,7 +789,6 @@ pub unsafe fn compress_integer_radix_async<T: UnsignedInteger>(
         carry_modulus.0 as u32,
         PBSType::Classical as u32,
         lwe_per_glwe,
-        storage_log_modulus,
         true,
     );
 
@@ -770,10 +796,9 @@ pub unsafe fn compress_integer_radix_async<T: UnsignedInteger>(
         streams.ptr.as_ptr(),
         streams.gpu_indexes_ptr(),
         streams.len() as u32,
-        glwe_array_out.as_mut_c_ptr(0),
-        lwe_array_in.as_c_ptr(0),
+        &raw mut glwe_array_out_ffi,
+        &raw const array_in_ffi,
         fp_keyswitch_key.ptr.as_ptr(),
-        num_blocks,
         mem_ptr,
     );
 
@@ -796,7 +821,6 @@ pub fn get_compression_size_on_gpu(
     ks_base_log: DecompositionBaseLog,
     ks_level: DecompositionLevelCount,
     lwe_per_glwe: u32,
-    storage_log_modulus: u32,
     num_blocks: u32,
 ) -> u64 {
     let mut mem_ptr: *mut i8 = std::ptr::null_mut();
@@ -816,7 +840,6 @@ pub fn get_compression_size_on_gpu(
         carry_modulus.0 as u32,
         PBSType::Classical as u32,
         lwe_per_glwe,
-        storage_log_modulus,
         false,
     )
 };
@@ -839,10 +862,9 @@ pub fn get_compression_size_on_gpu(
 /// is required
 pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
     streams: &CudaStreams,
-    lwe_array_out: &mut CudaVec<T>,
-    glwe_in: &CudaVec<T>,
+    lwe_array_out: &mut CudaLweCiphertextList<T>,
+    glwe_in: &CudaPackedGlweCiphertextList,
     bootstrapping_key: &CudaVec<B>,
-    bodies_count: u32,
     message_modulus: MessageModulus,
     carry_modulus: CarryModulus,
     encryption_glwe_dimension: GlweDimension,
@@ -852,23 +874,22 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
     lwe_dimension: LweDimension,
     pbs_base_log: DecompositionBaseLog,
     pbs_level: DecompositionLevelCount,
-    storage_log_modulus: u32,
     vec_indexes: &[u32],
-    num_lwes: u32,
+    num_blocks_to_decompress: u32,
 ) {
     assert_eq!(
         streams.gpu_indexes[0],
-        lwe_array_out.gpu_index(0),
+        lwe_array_out.0.d_vec.gpu_index(0),
         "GPU error: first stream is on GPU {}, first output pointer is on GPU {}",
         streams.gpu_indexes[0].get(),
-        lwe_array_out.gpu_index(0).get(),
+        lwe_array_out.0.d_vec.gpu_index(0).get(),
     );
     assert_eq!(
         streams.gpu_indexes[0],
-        glwe_in.gpu_index(0),
+        glwe_in.data.gpu_index(0),
         "GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
         streams.gpu_indexes[0].get(),
-        glwe_in.gpu_index(0).get(),
+        glwe_in.data.gpu_index(0).get(),
     );
     assert_eq!(
         streams.gpu_indexes[0],
@@ -878,6 +899,10 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
         bootstrapping_key.gpu_index(0).get(),
     );
     let mut mem_ptr: *mut i8 = std::ptr::null_mut();
+
+    let mut lwe_array_out_ffi = prepare_cuda_lwe_ct_ffi(lwe_array_out);
+    let glwe_array_in_ffi = prepare_cuda_packed_glwe_ct_ffi(glwe_in);
+
     scratch_cuda_integer_decompress_radix_ciphertext_64(
         streams.ptr.as_ptr(),
         streams.gpu_indexes_ptr(),
@@ -890,12 +915,10 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
         lwe_dimension.0 as u32,
         pbs_level.0 as u32,
         pbs_base_log.0 as u32,
-        num_lwes,
+        num_blocks_to_decompress,
         message_modulus.0 as u32,
         carry_modulus.0 as u32,
         PBSType::Classical as u32,
-        storage_log_modulus,
-        bodies_count,
         true,
         false,
     );
@@ -904,10 +927,9 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
         streams.ptr.as_ptr(),
         streams.gpu_indexes_ptr(),
         streams.len() as u32,
-        lwe_array_out.as_mut_c_ptr(0),
-        glwe_in.as_c_ptr(0),
+        &raw mut lwe_array_out_ffi,
+        &raw const glwe_array_in_ffi,
         vec_indexes.as_ptr(),
-        vec_indexes.len() as u32,
         bootstrapping_key.ptr.as_ptr(),
         mem_ptr,
     );
@@ -923,7 +945,6 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
 #[allow(clippy::too_many_arguments)]
 pub fn get_decompression_size_on_gpu(
     streams: &CudaStreams,
-    bodies_count: u32,
     message_modulus: MessageModulus,
     carry_modulus: CarryModulus,
     encryption_glwe_dimension: GlweDimension,
@@ -933,8 +954,7 @@ pub fn get_decompression_size_on_gpu(
     lwe_dimension: LweDimension,
     pbs_base_log: DecompositionBaseLog,
     pbs_level: DecompositionLevelCount,
-    storage_log_modulus: u32,
-    num_lwes: u32,
+    num_blocks_to_decompress: u32,
 ) -> u64 {
     let mut mem_ptr: *mut i8 = std::ptr::null_mut();
     let size_tracker = unsafe {
@@ -950,12 +970,10 @@ pub fn get_decompression_size_on_gpu(
         lwe_dimension.0 as u32,
         pbs_level.0 as u32,
         pbs_base_log.0 as u32,
-        num_lwes,
+        num_blocks_to_decompress,
        message_modulus.0 as u32,
        carry_modulus.0 as u32,
        PBSType::Classical as u32,
-        storage_log_modulus,
-        bodies_count,
        false,
        false,
    )