refactor(gpu): clean up compression

Enzo Di Maria
2025-07-30 16:50:51 +02:00
committed by enzodimaria
parent 46a7229c81
commit d1c417bf71
10 changed files with 295 additions and 237 deletions

View File

@@ -3,6 +3,26 @@
#include "../../pbs/pbs_enums.h"
typedef struct {
void *ptr;
uint32_t num_radix_blocks;
uint32_t lwe_dimension;
} CudaLweCiphertextListFFI;
typedef struct {
void *ptr;
uint32_t storage_log_modulus;
uint32_t lwe_per_glwe;
// Input LWEs are grouped into groups of `lwe_per_glwe` (the last group may
// be smaller). Each group is then packed into one GLWE with one body per
// LWE of the group. In the end, the total number of bodies equals the
// number of input LWEs.
uint32_t total_lwe_bodies_count;
uint32_t glwe_dimension;
uint32_t polynomial_size;
} CudaPackedGlweCiphertextListFFI;
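// Illustrative sketch (hypothetical helpers, not part of this header): how the
// grouping described above translates into sizes. The last group may be
// partial, hence the ceil-division and the remainder handling.
static inline uint32_t
packed_glwe_count(const CudaPackedGlweCiphertextListFFI *list) {
  return (list->total_lwe_bodies_count + list->lwe_per_glwe - 1) /
         list->lwe_per_glwe;
}
static inline uint32_t
last_group_body_count(const CudaPackedGlweCiphertextListFFI *list) {
  uint32_t rem = list->total_lwe_bodies_count % list->lwe_per_glwe;
  return rem == 0 ? list->lwe_per_glwe : rem;
}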
extern "C" {
uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
@@ -10,28 +30,29 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory);
uint32_t lwe_per_glwe, bool allocate_gpu_memory);
uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t storage_log_modulus, uint32_t body_count, bool allocate_gpu_memory,
uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_blocks_to_decompress, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
bool allocate_ms_array);
void cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void const *lwe_array_in, void *const *fp_ksk,
uint32_t num_nths, int8_t *mem_ptr);
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
int8_t *mem_ptr);
void cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *glwe_in, uint32_t const *indexes_array,
uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr);
CudaLweCiphertextListFFI *lwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr);
void cleanup_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,

View File

@@ -5,11 +5,6 @@
template <typename Torus> struct int_compression {
int_radix_params compression_params;
uint32_t storage_log_modulus;
uint32_t lwe_per_glwe;
uint32_t body_count;
// Compression
int8_t *fp_ks_buffer;
Torus *tmp_lwe;
@@ -19,13 +14,9 @@ template <typename Torus> struct int_compression {
int_compression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t lwe_per_glwe,
uint32_t storage_log_modulus, bool allocate_gpu_memory,
uint64_t &size_tracker) {
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->compression_params = compression_params;
this->lwe_per_glwe = lwe_per_glwe;
this->storage_log_modulus = storage_log_modulus;
this->body_count = num_radix_blocks;
uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
@@ -58,11 +49,7 @@ template <typename Torus> struct int_compression {
template <typename Torus> struct int_decompression {
int_radix_params encryption_params;
int_radix_params compression_params;
uint32_t storage_log_modulus;
uint32_t num_radix_blocks;
uint32_t body_count;
uint32_t num_blocks_to_decompress;
Torus *tmp_extracted_glwe;
Torus *tmp_extracted_lwe;
@@ -74,15 +61,12 @@ template <typename Torus> struct int_decompression {
int_decompression(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params encryption_params,
int_radix_params compression_params,
uint32_t num_radix_blocks, uint32_t body_count,
uint32_t storage_log_modulus, bool allocate_gpu_memory,
uint32_t num_blocks_to_decompress, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->encryption_params = encryption_params;
this->compression_params = compression_params;
this->storage_log_modulus = storage_log_modulus;
this->num_radix_blocks = num_radix_blocks;
this->body_count = body_count;
this->num_blocks_to_decompress = num_blocks_to_decompress;
uint64_t glwe_accumulator_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
@@ -90,18 +74,18 @@ template <typename Torus> struct int_decompression {
compression_params.polynomial_size +
1);
decompression_rescale_lut = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
streams, gpu_indexes, gpu_count, encryption_params, 1,
num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
tmp_extracted_glwe = (Torus *)cuda_malloc_with_size_tracking_async(
num_radix_blocks * glwe_accumulator_size * sizeof(Torus), streams[0],
gpu_indexes[0], size_tracker, allocate_gpu_memory);
num_blocks_to_decompress * glwe_accumulator_size * sizeof(Torus),
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
tmp_indexes_array = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_radix_blocks * sizeof(uint32_t), streams[0], gpu_indexes[0],
num_blocks_to_decompress * sizeof(uint32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
tmp_extracted_lwe = (Torus *)cuda_malloc_with_size_tracking_async(
num_radix_blocks * lwe_accumulator_size * sizeof(Torus), streams[0],
gpu_indexes[0], size_tracker, allocate_gpu_memory);
num_blocks_to_decompress * lwe_accumulator_size * sizeof(Torus),
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
// Rescale is done using an identity LUT
// Here we do not divide by message_modulus
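// Illustrative sketch (hypothetical helper, not the library's LUT builder):
// the plaintext table an identity rescale LUT encodes. Assuming the usual
// one-bit-of-padding encoding, each of the message_modulus * carry_modulus
// boxes re-encodes its own index, and no division by message_modulus occurs.
static inline void fill_identity_rescale_lut(uint64_t *lut,
                                             uint64_t message_modulus,
                                             uint64_t carry_modulus) {
  uint64_t box_count = message_modulus * carry_modulus;
  uint64_t delta = (1ULL << 63) / box_count; // one bit of padding assumed
  for (uint64_t i = 0; i < box_count; ++i)
    lut[i] = i * delta; // identity: keep message and carry as-is
}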

View File

@@ -123,7 +123,6 @@ __host__ void host_modulus_switch_inplace(cudaStream_t stream,
int num_threads = 0, num_blocks = 0;
getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);
modulus_switch_inplace<Torus>
<<<num_blocks, num_threads, 0, stream>>>(array, size, log_modulus);
check_cuda_error(cudaGetLastError());

View File

@@ -6,8 +6,7 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
uint32_t lwe_per_glwe, bool allocate_gpu_memory) {
int_radix_params compression_params(
pbs_type, compression_glwe_dimension, compression_polynomial_size,
@@ -18,17 +17,16 @@ uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
return scratch_cuda_compress_integer_radix_ciphertext<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_compression<uint64_t> **)mem_ptr, num_radix_blocks,
compression_params, lwe_per_glwe, storage_log_modulus,
allocate_gpu_memory);
compression_params, lwe_per_glwe, allocate_gpu_memory);
}
uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t storage_log_modulus, uint32_t body_count, bool allocate_gpu_memory,
uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_blocks_to_decompress, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
bool allocate_ms_array) {
// Decompression doesn't keyswitch, so big and small dimensions are the same
@@ -45,31 +43,29 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
return scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_decompression<uint64_t> **)mem_ptr, num_radix_blocks, body_count,
encryption_params, compression_params, storage_log_modulus,
allocate_gpu_memory);
(int_decompression<uint64_t> **)mem_ptr, num_blocks_to_decompress,
encryption_params, compression_params, allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *glwe_array_out, void const *lwe_array_in, void *const *fp_ksk,
uint32_t num_nths, int8_t *mem_ptr) {
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
int8_t *mem_ptr) {
host_integer_compress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(glwe_array_out),
static_cast<const uint64_t *>(lwe_array_in), (uint64_t *const *)(fp_ksk),
num_nths, (int_compression<uint64_t> *)mem_ptr);
host_integer_compress<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
gpu_count, glwe_array_out, lwe_array_in,
(uint64_t *const *)(fp_ksk),
(int_compression<uint64_t> *)mem_ptr);
}
void cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *glwe_in, uint32_t const *indexes_array,
uint32_t indexes_array_size, void *const *bsks, int8_t *mem_ptr) {
CudaLweCiphertextListFFI *lwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr) {
host_integer_decompress<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(glwe_in), indexes_array, indexes_array_size,
bsks, (int_decompression<uint64_t> *)mem_ptr);
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out, glwe_in,
indexes_array, bsks, (int_decompression<uint64_t> *)mem_ptr);
}
void cleanup_cuda_integer_compress_radix_ciphertext_64(

View File

@@ -48,19 +48,19 @@ __global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus,
/// implementation
template <typename Torus>
__host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
Torus *array_out, Torus *array_in, uint32_t num_glwes,
uint32_t num_lwes, int_compression<Torus> *mem_ptr) {
if (array_in == array_out)
CudaPackedGlweCiphertextListFFI *array_out,
Torus *array_in, uint32_t num_glwes,
int_radix_params compression_params) {
if (array_in == (Torus *)array_out->ptr)
PANIC("Cuda error: Input and output must be different");
cuda_set_device(gpu_index);
auto compression_params = mem_ptr->compression_params;
auto log_modulus = mem_ptr->storage_log_modulus;
auto log_modulus = array_out->storage_log_modulus;
// Mask coefficients of all `num_glwes` GLWEs plus one body per input LWE
auto in_len = num_glwes * compression_params.glwe_dimension *
compression_params.polynomial_size +
num_lwes;
array_out->total_lwe_bodies_count;
auto number_bits_to_pack = in_len * log_modulus;
@@ -73,18 +73,18 @@ __host__ void host_pack(cudaStream_t stream, uint32_t gpu_index,
dim3 grid(num_blocks);
dim3 threads(num_threads);
pack<Torus><<<grid, threads, 0, stream>>>(array_out, array_in, log_modulus,
out_len, in_len, out_len);
pack<Torus><<<grid, threads, 0, stream>>>(
(Torus *)array_out->ptr, array_in, log_modulus, out_len, in_len, out_len);
check_cuda_error(cudaGetLastError());
}
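// Illustrative sketch (hypothetical helper, not part of this file): the size
// in u64 words of the packed output for `in_len` coefficients that were each
// reduced to `log_modulus` bits by the preceding modulus switch.
static inline uint32_t packed_len_u64(uint32_t in_len, uint32_t log_modulus) {
  uint32_t number_bits_to_pack = in_len * log_modulus;
  uint32_t nbits = sizeof(uint64_t) * 8; // 64 bits per output word
  return (number_bits_to_pack + nbits - 1) / nbits;
}
// e.g. (illustrative) in_len = 13192 and log_modulus = 12 pack into
// ceil(13192 * 12 / 64) = 2474 u64 words.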
template <typename Torus>
__host__ void
host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *glwe_array_out,
Torus const *lwe_array_in, Torus *const *fp_ksk,
uint32_t num_radix_blocks,
int_compression<Torus> *mem_ptr) {
uint32_t gpu_count,
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in,
Torus *const *fp_ksk, int_compression<Torus> *mem_ptr) {
auto compression_params = mem_ptr->compression_params;
auto input_lwe_dimension = compression_params.small_lwe_dimension;
@@ -93,14 +93,14 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
auto lwe_shifted = mem_ptr->tmp_lwe;
host_cleartext_multiplication<Torus>(
streams[0], gpu_indexes[0], lwe_shifted, lwe_array_in,
(uint64_t)compression_params.message_modulus, input_lwe_dimension,
num_radix_blocks);
(uint64_t)compression_params.message_modulus);
uint32_t lwe_in_size = input_lwe_dimension + 1;
uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
uint32_t num_glwes =
(num_radix_blocks + mem_ptr->lwe_per_glwe - 1) / mem_ptr->lwe_per_glwe;
uint32_t num_glwes = (glwe_array_out->total_lwe_bodies_count +
glwe_array_out->lwe_per_glwe - 1) /
glwe_array_out->lwe_per_glwe;
// Keyswitch LWEs to GLWE
auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out;
@@ -109,12 +109,13 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
compression_params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0]);
auto fp_ks_buffer = mem_ptr->fp_ks_buffer;
auto rem_lwes = num_radix_blocks;
auto rem_lwes = glwe_array_out->total_lwe_bodies_count;
auto lwe_subset = lwe_shifted;
auto glwe_out = tmp_glwe_array_out;
while (rem_lwes > 0) {
auto chunk_size = min(rem_lwes, mem_ptr->lwe_per_glwe);
auto chunk_size = min(rem_lwes, glwe_array_out->lwe_per_glwe);
host_packing_keyswitch_lwe_list_to_glwe<Torus>(
streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0],
@@ -128,15 +129,16 @@ host_integer_compress(cudaStream_t const *streams, uint32_t const *gpu_indexes,
}
// Modulus switch
host_modulus_switch_inplace<Torus>(
streams[0], gpu_indexes[0], tmp_glwe_array_out,
num_glwes * compression_params.glwe_dimension *
compression_params.polynomial_size +
num_radix_blocks,
mem_ptr->storage_log_modulus);
int size = num_glwes * compression_params.glwe_dimension *
compression_params.polynomial_size +
glwe_array_out->total_lwe_bodies_count;
host_modulus_switch_inplace<Torus>(streams[0], gpu_indexes[0],
tmp_glwe_array_out, size,
glwe_array_out->storage_log_modulus);
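// (Illustrative note) the switch above reduces every one of the `size`
// coefficients to storage_log_modulus significant bits (with rounding), which
// is what lets host_pack below store exactly log_modulus bits per coefficient.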
host_pack<Torus>(streams[0], gpu_indexes[0], glwe_array_out,
tmp_glwe_array_out, num_glwes, num_radix_blocks, mem_ptr);
tmp_glwe_array_out, num_glwes, compression_params);
}
template <typename Torus>
@@ -176,34 +178,38 @@ __global__ void extract(Torus *glwe_array_out, Torus const *array_in,
/// This function follows the naming used in the CPU implementation
template <typename Torus>
__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
Torus *glwe_array_out, Torus const *array_in,
uint32_t glwe_index,
int_decompression<Torus> *mem_ptr) {
if (array_in == glwe_array_out)
Torus *glwe_array_out,
CudaPackedGlweCiphertextListFFI const *array_in,
uint32_t glwe_index) {
if ((Torus *)array_in->ptr == glwe_array_out)
PANIC("Cuda error: Input and output must be different");
cuda_set_device(gpu_index);
auto compression_params = mem_ptr->compression_params;
auto log_modulus = mem_ptr->storage_log_modulus;
auto glwe_ciphertext_size = (compression_params.glwe_dimension + 1) *
compression_params.polynomial_size;
auto log_modulus = array_in->storage_log_modulus;
auto total_lwe_bodies_count = array_in->total_lwe_bodies_count;
auto polynomial_size = array_in->polynomial_size;
auto glwe_dimension = array_in->glwe_dimension;
uint32_t body_count = mem_ptr->body_count;
auto num_glwes = (body_count + compression_params.polynomial_size - 1) /
compression_params.polynomial_size;
auto glwe_ciphertext_size = (glwe_dimension + 1) * polynomial_size;
uint32_t num_glwes =
(total_lwe_bodies_count + polynomial_size - 1) / polynomial_size;
// Number of LWE bodies stored in the GLWE we want to extract (the last GLWE
// may be only partially filled)
if (mem_ptr->body_count % compression_params.polynomial_size == 0)
body_count = compression_params.polynomial_size;
else if (glwe_index == num_glwes - 1)
body_count = mem_ptr->body_count % compression_params.polynomial_size;
else
body_count = compression_params.polynomial_size;
uint32_t body_count = 0;
if (glwe_index == num_glwes - 1) {
auto remainder = total_lwe_bodies_count % polynomial_size;
if (remainder == 0) {
body_count = polynomial_size;
} else {
body_count = remainder;
}
} else {
body_count = polynomial_size;
}
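// Illustrative values: with total_lwe_bodies_count = 5000 and
// polynomial_size = 4096, num_glwes = 2; glwe_index 0 holds 4096 bodies and
// the last GLWE (glwe_index 1) holds the remaining 904.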
auto initial_out_len =
compression_params.glwe_dimension * compression_params.polynomial_size +
body_count;
uint32_t initial_out_len = glwe_dimension * polynomial_size + body_count;
// Calculates how many bits this particular GLWE shall use
auto number_bits_to_unpack = initial_out_len * log_modulus;
@@ -213,7 +219,7 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
number_bits_to_unpack = glwe_ciphertext_size * log_modulus;
auto len = (number_bits_to_unpack + nbits - 1) / nbits;
// Uses that length to set the input pointer
auto chunk_array_in = array_in + glwe_index * len;
auto chunk_array_in = (Torus *)array_in->ptr + glwe_index * len;
// Ensure the tail of the GLWE is zeroed
if (initial_out_len < glwe_ciphertext_size) {
@@ -232,25 +238,24 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
}
template <typename Torus>
__host__ void host_integer_decompress(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *d_lwe_array_out, Torus const *d_packed_glwe_in,
uint32_t const *h_indexes_array, uint32_t indexes_array_size,
void *const *d_bsks, int_decompression<Torus> *h_mem_ptr) {
__host__ void
host_integer_decompress(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaLweCiphertextListFFI *d_lwe_array_out,
CudaPackedGlweCiphertextListFFI const *d_packed_glwe_in,
uint32_t const *h_indexes_array, void *const *d_bsks,
int_decompression<Torus> *h_mem_ptr) {
auto num_blocks_to_decompress = h_mem_ptr->num_blocks_to_decompress;
auto d_indexes_array = h_mem_ptr->tmp_indexes_array;
cuda_memcpy_async_to_gpu(d_indexes_array, (void *)h_indexes_array,
indexes_array_size * sizeof(uint32_t), streams[0],
gpu_indexes[0]);
num_blocks_to_decompress * sizeof(uint32_t),
streams[0], gpu_indexes[0]);
auto compression_params = h_mem_ptr->compression_params;
auto lwe_per_glwe = compression_params.polynomial_size;
auto num_radix_blocks = h_mem_ptr->num_radix_blocks;
if (num_radix_blocks != indexes_array_size)
PANIC("Cuda error: wrong number of LWEs in decompress: the number of LWEs "
"should be the same as indexes_array_size.")
// The first element of each pair is the number of LWEs that lie in the
// corresponding GLWE
std::vector<std::pair<int, Torus *>> glwe_vec;
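// Illustrative values: with lwe_per_glwe = 4096 and
// h_indexes_array = {0, 1, 2, 4096, 4097}, the loop below builds
// glwe_vec = {(3, glwe_0), (2, glwe_1)}: three LWEs are read from the first
// packed GLWE and two from the second.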
@@ -261,16 +266,16 @@ __host__ void host_integer_decompress(
auto current_glwe_index = h_indexes_array[0] / lwe_per_glwe;
auto extracted_glwe = h_mem_ptr->tmp_extracted_glwe;
host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
d_packed_glwe_in, current_glwe_index, h_mem_ptr);
d_packed_glwe_in, current_glwe_index);
glwe_vec.push_back(std::make_pair(1, extracted_glwe));
for (int i = 1; i < indexes_array_size; i++) {
for (int i = 1; i < num_blocks_to_decompress; i++) {
auto glwe_index = h_indexes_array[i] / lwe_per_glwe;
if (glwe_index != current_glwe_index) {
extracted_glwe += glwe_accumulator_size;
current_glwe_index = glwe_index;
// Extracts a new GLWE
host_extract<Torus>(streams[0], gpu_indexes[0], extracted_glwe,
d_packed_glwe_in, glwe_index, h_mem_ptr);
d_packed_glwe_in, glwe_index);
glwe_vec.push_back(std::make_pair(1, extracted_glwe));
} else {
// Updates the quantity
@@ -306,17 +311,19 @@ __host__ void host_integer_decompress(
/// dimension to a big LWE dimension
auto encryption_params = h_mem_ptr->encryption_params;
auto lut = h_mem_ptr->decompression_rescale_lut;
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
auto active_gpu_count =
get_active_gpu_count(num_blocks_to_decompress, gpu_count);
if (active_gpu_count == 1) {
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec, extracted_lwe,
lut->lwe_indexes_in, d_bsks, nullptr, lut->buffer,
encryption_params.glwe_dimension,
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_radix_blocks, encryption_params.pbs_type, num_many_lut, lut_stride);
num_blocks_to_decompress, encryption_params.pbs_type, num_many_lut,
lut_stride);
} else {
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
@@ -332,7 +339,7 @@ __host__ void host_integer_decompress(
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, extracted_lwe,
lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->active_gpu_count, num_radix_blocks,
lut->active_gpu_count, num_blocks_to_decompress,
compression_params.small_lwe_dimension + 1);
/// Apply PBS
@@ -344,13 +351,14 @@ __host__ void host_integer_decompress(
compression_params.small_lwe_dimension,
encryption_params.polynomial_size, encryption_params.pbs_base_log,
encryption_params.pbs_level, encryption_params.grouping_factor,
num_radix_blocks, encryption_params.pbs_type, num_many_lut, lut_stride);
num_blocks_to_decompress, encryption_params.pbs_type, num_many_lut,
lut_stride);
/// Copy data back to GPU 0 and release vecs
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, d_lwe_array_out,
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lwe_after_pbs_vec, lut->h_lwe_indexes_out,
lut->using_trivial_lwe_indexes, num_radix_blocks,
lut->using_trivial_lwe_indexes, num_blocks_to_decompress,
encryption_params.big_lwe_dimension + 1);
/// Synchronize all GPUs
@@ -365,13 +373,12 @@ __host__ uint64_t scratch_cuda_compress_integer_radix_ciphertext(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_compression<Torus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params compression_params,
uint32_t lwe_per_glwe, uint32_t storage_log_modulus,
bool allocate_gpu_memory) {
uint32_t lwe_per_glwe, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_compression<Torus>(
streams, gpu_indexes, gpu_count, compression_params, num_radix_blocks,
lwe_per_glwe, storage_log_modulus, allocate_gpu_memory, size_tracker);
lwe_per_glwe, allocate_gpu_memory, size_tracker);
return size_tracker;
}
@@ -379,15 +386,13 @@ template <typename Torus>
__host__ uint64_t scratch_cuda_integer_decompress_radix_ciphertext(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_decompression<Torus> **mem_ptr,
uint32_t num_radix_blocks, uint32_t body_count,
int_radix_params encryption_params, int_radix_params compression_params,
uint32_t storage_log_modulus, bool allocate_gpu_memory) {
uint32_t num_blocks_to_decompress, int_radix_params encryption_params,
int_radix_params compression_params, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_decompression<Torus>(
streams, gpu_indexes, gpu_count, encryption_params, compression_params,
num_radix_blocks, body_count, storage_log_modulus, allocate_gpu_memory,
size_tracker);
num_blocks_to_decompress, allocate_gpu_memory, size_tracker);
return size_tracker;
}
#endif

View File

@@ -13,6 +13,8 @@
#include <iostream>
#include <vector>
#include "integer/compression/compression.h"
template <typename T>
__global__ void cleartext_vec_multiplication(T *output, T const *lwe_input,
T const *cleartext_input,
@@ -51,9 +53,9 @@ __host__ void host_cleartext_vec_multiplication(
}
template <typename T>
__global__ void
cleartext_multiplication(T *output, T const *lwe_input, T cleartext_input,
uint32_t input_lwe_dimension, uint32_t num_entries) {
__global__ void cleartext_multiplication(T *output, T const *lwe_input,
T cleartext_input,
const uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
@@ -64,25 +66,21 @@ cleartext_multiplication(T *output, T const *lwe_input, T cleartext_input,
}
template <typename T>
__host__ void
host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index,
T *output, T const *lwe_input, T cleartext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
__host__ void host_cleartext_multiplication(
cudaStream_t stream, uint32_t gpu_index, T *output,
CudaLweCiphertextListFFI const *lwe_input, T cleartext_input) {
cuda_set_device(gpu_index);
// lwe_size includes the body, whereas lwe_dimension only counts the mask
// elements
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count * lwe_size;
uint32_t num_entries =
lwe_input->num_radix_blocks * (lwe_input->lwe_dimension + 1);
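// (Illustrative) with lwe_dimension = 879, each block contributes 880 u64
// values (mask + body), so num_entries = num_radix_blocks * 880.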
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
cleartext_multiplication<T><<<grid, thds, 0, stream>>>(
output, lwe_input, cleartext_input, input_lwe_dimension, num_entries);
output, (T *)lwe_input->ptr, cleartext_input, num_entries);
check_cuda_error(cudaGetLastError());
}

View File

@@ -105,6 +105,55 @@ const _: () = {
ms_input_variance
) - 32usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaLweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub num_radix_blocks: u32,
pub lwe_dimension: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaLweCiphertextListFFI"]
[::std::mem::size_of::<CudaLweCiphertextListFFI>() - 16usize];
["Alignment of CudaLweCiphertextListFFI"]
[::std::mem::align_of::<CudaLweCiphertextListFFI>() - 8usize];
["Offset of field: CudaLweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaLweCiphertextListFFI::num_radix_blocks"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, num_radix_blocks) - 8usize];
["Offset of field: CudaLweCiphertextListFFI::lwe_dimension"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, lwe_dimension) - 12usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaPackedGlweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub storage_log_modulus: u32,
pub lwe_per_glwe: u32,
pub total_lwe_bodies_count: u32,
pub glwe_dimension: u32,
pub polynomial_size: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaPackedGlweCiphertextListFFI"]
[::std::mem::size_of::<CudaPackedGlweCiphertextListFFI>() - 32usize];
["Alignment of CudaPackedGlweCiphertextListFFI"]
[::std::mem::align_of::<CudaPackedGlweCiphertextListFFI>() - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::storage_log_modulus"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, storage_log_modulus) - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::lwe_per_glwe"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, lwe_per_glwe) - 12usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::total_lwe_bodies_count"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, total_lwe_bodies_count) - 16usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::glwe_dimension"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, glwe_dimension) - 20usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::polynomial_size"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
};
unsafe extern "C" {
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
@@ -121,7 +170,6 @@ unsafe extern "C" {
carry_modulus: u32,
pbs_type: PBS_TYPE,
lwe_per_glwe: u32,
storage_log_modulus: u32,
allocate_gpu_memory: bool,
) -> u64;
}
@@ -138,12 +186,10 @@ unsafe extern "C" {
lwe_dimension: u32,
pbs_level: u32,
pbs_base_log: u32,
num_radix_blocks: u32,
num_blocks_to_decompress: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
storage_log_modulus: u32,
body_count: u32,
allocate_gpu_memory: bool,
allocate_ms_array: bool,
) -> u64;
@@ -153,10 +199,9 @@ unsafe extern "C" {
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
glwe_array_out: *mut ffi::c_void,
lwe_array_in: *const ffi::c_void,
glwe_array_out: *mut CudaPackedGlweCiphertextListFFI,
lwe_array_in: *const CudaLweCiphertextListFFI,
fp_ksk: *const *mut ffi::c_void,
num_nths: u32,
mem_ptr: *mut i8,
);
}
@@ -165,10 +210,9 @@ unsafe extern "C" {
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lwe_array_out: *mut ffi::c_void,
glwe_in: *const ffi::c_void,
lwe_array_out: *mut CudaLweCiphertextListFFI,
glwe_in: *const CudaPackedGlweCiphertextListFFI,
indexes_array: *const u32,
indexes_array_size: u32,
bsks: *const *mut ffi::c_void,
mem_ptr: *mut i8,
);

View File

@@ -261,7 +261,7 @@ impl CudaCompressedCiphertextList {
v
};
let mut num_bodies_left = gpu_meta.bodies_count;
let mut num_bodies_left = gpu_meta.total_lwe_bodies_count;
let mut chunk_start = 0;
while num_bodies_left != 0 {
let bodies_count = LweCiphertextCount(num_bodies_left.min(lwe_per_glwe.0));
@@ -427,7 +427,7 @@ impl CompressedCiphertextList {
ciphertext_modulus: cpu_meta.ciphertext_modulus,
storage_log_modulus: first_ct.packed_integers().log_modulus(),
lwe_per_glwe,
bodies_count: self.packed_list.len(),
total_lwe_bodies_count: self.packed_list.len(),
initial_len,
})
});

View File

@@ -49,7 +49,7 @@ pub struct CudaPackedGlweCiphertextListMeta {
pub storage_log_modulus: CiphertextModulusLog,
pub lwe_per_glwe: LweCiphertextCount,
// Number of LWE bodies that are compressed in this list
pub bodies_count: usize,
pub total_lwe_bodies_count: usize,
// Number of elements (u64) the uncompressed GLWE list had
// Keep in mind the last GLWE may not be full
pub initial_len: usize,
@@ -69,7 +69,9 @@ impl CudaPackedGlweCiphertextList {
pub fn bodies_count(&self) -> usize {
// If there is no metadata, the list is empty
self.meta.map(|meta| meta.bodies_count).unwrap_or_default()
self.meta
.map(|meta| meta.total_lwe_bodies_count)
.unwrap_or_default()
}
pub fn glwe_ciphertext_count(&self) -> GlweCiphertextCount {
@@ -180,7 +182,7 @@ impl CudaCompressionKey {
let uncompressed_len = num_glwes * glwe_mask_size + num_lwes;
let number_bits_to_pack = uncompressed_len * self.storage_log_modulus.0;
let compressed_len = number_bits_to_pack.div_ceil(u64::BITS as usize);
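// Illustrative values: with num_glwes = 2, glwe_mask_size = 4096,
// num_lwes = 5000 and storage_log_modulus = 12, uncompressed_len
// = 2 * 4096 + 5000 = 13192 and compressed_len = (13192 * 12).div_ceil(64)
// = 2474 u64 words on the GPU.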
let mut packed_glwe_list = CudaVec::new(compressed_len, streams, 0);
let packed_glwe_list = CudaVec::new(compressed_len, streams, 0);
if ciphertexts.is_empty() {
return CudaPackedGlweCiphertextList {
@@ -194,16 +196,30 @@ impl CudaCompressionKey {
let first_ct_info = first_ct.info.blocks.first().unwrap();
let message_modulus = first_ct_info.message_modulus;
let carry_modulus = first_ct_info.carry_modulus;
let lwe_dimension = first_ct.d_blocks.lwe_dimension();
let mut glwe_array_out = CudaPackedGlweCiphertextList {
data: packed_glwe_list,
meta: Some(CudaPackedGlweCiphertextListMeta {
glwe_dimension: compressed_glwe_size.to_glwe_dimension(),
polynomial_size: compressed_polynomial_size,
message_modulus,
carry_modulus,
ciphertext_modulus,
storage_log_modulus: self.storage_log_modulus,
lwe_per_glwe: LweCiphertextCount(compressed_polynomial_size.0),
total_lwe_bodies_count: num_lwes,
initial_len: uncompressed_len,
}),
};
unsafe {
let input_lwes = Self::flatten_async(ciphertexts, streams);
compress_integer_radix_async(
streams,
&mut packed_glwe_list,
&input_lwes.0.d_vec,
&mut glwe_array_out,
&input_lwes,
&self.packing_key_switching_key.d_vec,
message_modulus,
carry_modulus,
@@ -213,29 +229,13 @@ impl CudaCompressionKey {
lwe_pksk.decomposition_base_log(),
lwe_pksk.decomposition_level_count(),
self.lwe_per_glwe.0 as u32,
self.storage_log_modulus.0 as u32,
num_lwes as u32,
);
streams.synchronize();
};
let meta = Some(CudaPackedGlweCiphertextListMeta {
glwe_dimension: compressed_glwe_size.to_glwe_dimension(),
polynomial_size: compressed_polynomial_size,
message_modulus,
carry_modulus,
ciphertext_modulus,
storage_log_modulus: self.storage_log_modulus,
lwe_per_glwe: LweCiphertextCount(compressed_polynomial_size.0),
bodies_count: num_lwes,
initial_len: uncompressed_len,
});
CudaPackedGlweCiphertextList {
data: packed_glwe_list,
meta,
}
glwe_array_out
}
pub fn get_compression_size_on_gpu(
&self,
@@ -259,7 +259,6 @@ impl CudaCompressionKey {
lwe_pksk.decomposition_base_log(),
lwe_pksk.decomposition_level_count(),
self.lwe_per_glwe.0 as u32,
self.storage_log_modulus.0 as u32,
num_lwes,
)
}
@@ -308,7 +307,6 @@ impl CudaDecompressionKey {
let message_modulus = self.message_modulus;
let carry_modulus = self.carry_modulus;
let ciphertext_modulus = self.ciphertext_modulus;
let storage_log_modulus = meta.storage_log_modulus;
match &self.blind_rotate_key {
CudaBootstrappingKey::Classic(bsk) => {
@@ -328,10 +326,9 @@ impl CudaDecompressionKey {
unsafe {
decompress_integer_radix_async(
streams,
&mut output_lwe.0.d_vec,
&packed_list.data,
&mut output_lwe,
packed_list,
&bsk.d_vec,
meta.bodies_count as u32,
message_modulus,
carry_modulus,
encryption_glwe_dimension,
@@ -341,7 +338,6 @@ impl CudaDecompressionKey {
lwe_dimension,
bsk.decomp_base_log(),
bsk.decomp_level_count(),
storage_log_modulus.0 as u32,
indexes_array.as_slice(),
indexes_array_len.0 as u32,
);
@@ -403,7 +399,6 @@ impl CudaDecompressionKey {
let message_modulus = self.message_modulus;
let carry_modulus = self.carry_modulus;
let storage_log_modulus = meta.storage_log_modulus;
match &self.blind_rotate_key {
CudaBootstrappingKey::Classic(bsk) => {
@@ -415,7 +410,6 @@ impl CudaDecompressionKey {
get_decompression_size_on_gpu(
streams,
packed_list.bodies_count() as u32,
message_modulus,
carry_modulus,
encryption_glwe_dimension,
@@ -425,7 +419,6 @@ impl CudaDecompressionKey {
lwe_dimension,
bsk.decomp_base_log(),
bsk.decomp_level_count(),
storage_log_modulus.0 as u32,
indexes_array_len.0 as u32,
)
}

View File

@@ -21,6 +21,7 @@ use crate::core_crypto::prelude::{
use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::{CudaRadixCiphertext, KsType};
use crate::integer::gpu::list_compression::server_keys::CudaPackedGlweCiphertextList;
use crate::integer::server_key::radix_parallel::scalar_div_mod::{
choose_multiplier, SignedReciprocable,
};
@@ -100,6 +101,29 @@ pub fn prepare_default_scalar_divisor() -> CudaScalarDivisorFFI {
}
}
fn prepare_cuda_lwe_ct_ffi<T: UnsignedInteger>(
input: &CudaLweCiphertextList<T>,
) -> CudaLweCiphertextListFFI {
CudaLweCiphertextListFFI {
ptr: input.0.d_vec.get_mut_c_ptr(0),
num_radix_blocks: input.0.lwe_ciphertext_count.0 as u32,
lwe_dimension: input.0.lwe_dimension.0 as u32,
}
}
fn prepare_cuda_packed_glwe_ct_ffi(
input: &CudaPackedGlweCiphertextList,
) -> CudaPackedGlweCiphertextListFFI {
CudaPackedGlweCiphertextListFFI {
ptr: input.data.get_mut_c_ptr(0),
storage_log_modulus: input.meta.unwrap().storage_log_modulus.0 as u32,
lwe_per_glwe: input.meta.unwrap().lwe_per_glwe.0 as u32,
total_lwe_bodies_count: input.meta.unwrap().total_lwe_bodies_count as u32,
glwe_dimension: input.meta.unwrap().glwe_dimension.0 as u32,
polynomial_size: input.meta.unwrap().polynomial_size.0 as u32,
}
}
// If we build the Vec<u64> inside prepare_cuda_radix_ffi
// the data gets dropped before the call to the Cuda function,
// and we get memory errors, hence why the reconstruction of
@@ -711,8 +735,8 @@ where
/// is required
pub unsafe fn compress_integer_radix_async<T: UnsignedInteger>(
streams: &CudaStreams,
glwe_array_out: &mut CudaVec<T>,
lwe_array_in: &CudaVec<T>,
glwe_array_out: &mut CudaPackedGlweCiphertextList,
lwe_array_in: &CudaLweCiphertextList<T>,
fp_keyswitch_key: &CudaVec<u64>,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
@@ -722,23 +746,8 @@ pub unsafe fn compress_integer_radix_async<T: UnsignedInteger>(
ks_base_log: DecompositionBaseLog,
ks_level: DecompositionLevelCount,
lwe_per_glwe: u32,
storage_log_modulus: u32,
num_blocks: u32,
) {
assert_eq!(
streams.gpu_indexes[0],
glwe_array_out.gpu_index(0),
"GPU error: first stream is on GPU {}, first glwe output pointer is on GPU {}",
streams.gpu_indexes[0].get(),
glwe_array_out.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
lwe_array_in.gpu_index(0),
"GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
streams.gpu_indexes[0].get(),
lwe_array_in.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
fp_keyswitch_key.gpu_index(0),
@@ -746,7 +755,25 @@ pub unsafe fn compress_integer_radix_async<T: UnsignedInteger>(
streams.gpu_indexes[0].get(),
fp_keyswitch_key.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
lwe_array_in.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first output pointer is on GPU {}",
streams.gpu_indexes[0].get(),
lwe_array_in.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
glwe_array_out.data.gpu_index(0),
"GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
streams.gpu_indexes[0].get(),
glwe_array_out.data.gpu_index(0).get(),
);
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
let array_in_ffi = prepare_cuda_lwe_ct_ffi(lwe_array_in);
let mut glwe_array_out_ffi = prepare_cuda_packed_glwe_ct_ffi(glwe_array_out);
scratch_cuda_integer_compress_radix_ciphertext_64(
streams.ptr.as_ptr(),
streams.gpu_indexes_ptr(),
@@ -762,7 +789,6 @@ pub unsafe fn compress_integer_radix_async<T: UnsignedInteger>(
carry_modulus.0 as u32,
PBSType::Classical as u32,
lwe_per_glwe,
storage_log_modulus,
true,
);
@@ -770,10 +796,9 @@ pub unsafe fn compress_integer_radix_async<T: UnsignedInteger>(
streams.ptr.as_ptr(),
streams.gpu_indexes_ptr(),
streams.len() as u32,
glwe_array_out.as_mut_c_ptr(0),
lwe_array_in.as_c_ptr(0),
&raw mut glwe_array_out_ffi,
&raw const array_in_ffi,
fp_keyswitch_key.ptr.as_ptr(),
num_blocks,
mem_ptr,
);
@@ -796,7 +821,6 @@ pub fn get_compression_size_on_gpu(
ks_base_log: DecompositionBaseLog,
ks_level: DecompositionLevelCount,
lwe_per_glwe: u32,
storage_log_modulus: u32,
num_blocks: u32,
) -> u64 {
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
@@ -816,7 +840,6 @@ pub fn get_compression_size_on_gpu(
carry_modulus.0 as u32,
PBSType::Classical as u32,
lwe_per_glwe,
storage_log_modulus,
false,
)
};
@@ -839,10 +862,9 @@ pub fn get_compression_size_on_gpu(
/// is required
pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
streams: &CudaStreams,
lwe_array_out: &mut CudaVec<T>,
glwe_in: &CudaVec<T>,
lwe_array_out: &mut CudaLweCiphertextList<T>,
glwe_in: &CudaPackedGlweCiphertextList,
bootstrapping_key: &CudaVec<B>,
bodies_count: u32,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
encryption_glwe_dimension: GlweDimension,
@@ -852,23 +874,22 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
lwe_dimension: LweDimension,
pbs_base_log: DecompositionBaseLog,
pbs_level: DecompositionLevelCount,
storage_log_modulus: u32,
vec_indexes: &[u32],
num_lwes: u32,
num_blocks_to_decompress: u32,
) {
assert_eq!(
streams.gpu_indexes[0],
lwe_array_out.gpu_index(0),
lwe_array_out.0.d_vec.gpu_index(0),
"GPU error: first stream is on GPU {}, first output pointer is on GPU {}",
streams.gpu_indexes[0].get(),
lwe_array_out.gpu_index(0).get(),
lwe_array_out.0.d_vec.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
glwe_in.gpu_index(0),
glwe_in.data.gpu_index(0),
"GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
streams.gpu_indexes[0].get(),
glwe_in.gpu_index(0).get(),
glwe_in.data.gpu_index(0).get(),
);
assert_eq!(
streams.gpu_indexes[0],
@@ -878,6 +899,10 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
bootstrapping_key.gpu_index(0).get(),
);
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
let mut lwe_array_out_ffi = prepare_cuda_lwe_ct_ffi(lwe_array_out);
let glwe_array_in_ffi = prepare_cuda_packed_glwe_ct_ffi(glwe_in);
scratch_cuda_integer_decompress_radix_ciphertext_64(
streams.ptr.as_ptr(),
streams.gpu_indexes_ptr(),
@@ -890,12 +915,10 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
lwe_dimension.0 as u32,
pbs_level.0 as u32,
pbs_base_log.0 as u32,
num_lwes,
num_blocks_to_decompress,
message_modulus.0 as u32,
carry_modulus.0 as u32,
PBSType::Classical as u32,
storage_log_modulus,
bodies_count,
true,
false,
);
@@ -904,10 +927,9 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
streams.ptr.as_ptr(),
streams.gpu_indexes_ptr(),
streams.len() as u32,
lwe_array_out.as_mut_c_ptr(0),
glwe_in.as_c_ptr(0),
&raw mut lwe_array_out_ffi,
&raw const glwe_array_in_ffi,
vec_indexes.as_ptr(),
vec_indexes.len() as u32,
bootstrapping_key.ptr.as_ptr(),
mem_ptr,
);
@@ -923,7 +945,6 @@ pub unsafe fn decompress_integer_radix_async<T: UnsignedInteger, B: Numeric>(
#[allow(clippy::too_many_arguments)]
pub fn get_decompression_size_on_gpu(
streams: &CudaStreams,
bodies_count: u32,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
encryption_glwe_dimension: GlweDimension,
@@ -933,8 +954,7 @@ pub fn get_decompression_size_on_gpu(
lwe_dimension: LweDimension,
pbs_base_log: DecompositionBaseLog,
pbs_level: DecompositionLevelCount,
storage_log_modulus: u32,
num_lwes: u32,
num_blocks_to_decompress: u32,
) -> u64 {
let mut mem_ptr: *mut i8 = std::ptr::null_mut();
let size_tracker = unsafe {
@@ -950,12 +970,10 @@ pub fn get_decompression_size_on_gpu(
lwe_dimension.0 as u32,
pbs_level.0 as u32,
pbs_base_log.0 as u32,
num_lwes,
num_blocks_to_decompress,
message_modulus.0 as u32,
carry_modulus.0 as u32,
PBSType::Classical as u32,
storage_log_modulus,
bodies_count,
false,
false,
)