Compare commits

...

1 Commit

Author SHA1 Message Date
Beka Barbakadze
84e43630b4 feat(gpu): Implements optimized division algorithm for message_2_carry_2 2025-09-01 15:13:04 +04:00
6 changed files with 1295 additions and 274 deletions

View File

@@ -2,27 +2,7 @@
#define CUDA_INTEGER_COMPRESSION_H
#include "../../pbs/pbs_enums.h"
typedef struct {
void *ptr;
uint32_t num_radix_blocks;
uint32_t lwe_dimension;
} CudaLweCiphertextListFFI;
typedef struct {
void *ptr;
uint32_t storage_log_modulus;
uint32_t lwe_per_glwe;
// Input LWEs are grouped in chunks of `lwe_per_glwe` (the last chunk may be
// smaller). Each chunk is packed into one GLWE with `lwe_per_glwe` bodies
// (one per LWE in the chunk), so the total number of bodies equals the
// number of input LWEs.
uint32_t total_lwe_bodies_count;
uint32_t glwe_dimension;
uint32_t polynomial_size;
} CudaPackedGlweCiphertextListFFI;
#include "../integer.h"
extern "C" {
uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,

View File

@@ -80,6 +80,26 @@ typedef struct {
bool const divisor_has_more_bits_than_numerator;
} CudaScalarDivisorFFI;
typedef struct {
void *ptr;
uint32_t num_radix_blocks;
uint32_t lwe_dimension;
} CudaLweCiphertextListFFI;
typedef struct {
void *ptr;
uint32_t storage_log_modulus;
uint32_t lwe_per_glwe;
// Input LWEs are grouped in chunks of `lwe_per_glwe` (the last chunk may be
// smaller). Each chunk is packed into one GLWE with `lwe_per_glwe` bodies
// (one per LWE in the chunk), so the total number of bodies equals the
// number of input LWEs.
uint32_t total_lwe_bodies_count;
uint32_t glwe_dimension;
uint32_t polynomial_size;
} CudaPackedGlweCiphertextListFFI;
uint64_t scratch_cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,

View File

@@ -4153,6 +4153,771 @@ template <typename Torus> struct int_comparison_buffer {
}
};
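// Scratch for radix subtraction (a sketch of the intent, inferred from the
// members below): rhs is negated into neg_rhs_array, then the result is
// carry-propagated with the same machinery as addition (sc_prop_mem).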
template <typename Torus> struct int_sub_and_propagate {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *neg_rhs_array;
int_sc_prop_memory<Torus> *sc_prop_mem;
int_sub_and_propagate(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
const int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->sc_prop_mem = new int_sc_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
requested_flag_in, (uint32_t)0, allocate_gpu_memory, size_tracker);
this->neg_rhs_array = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], neg_rhs_array, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
sc_prop_mem->release(streams, gpu_indexes, gpu_count);
delete sc_prop_mem;
release_radix_ciphertext_async(streams[0], gpu_indexes[0], neg_rhs_array,
allocate_gpu_memory);
delete neg_rhs_array;
}
};
template <typename Torus> struct int_bitop_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
BITOP_TYPE op;
bool gpu_memory_allocated;
int_bitop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, BITOP_TYPE op, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
switch (op) {
case BITAND:
case BITOR:
case BITXOR:
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_radix_blocks, allocate_gpu_memory,
size_tracker);
{
auto lut_bivariate_f = [op](Torus lhs, Torus rhs) -> Torus {
if (op == BITOP_TYPE::BITAND) {
// AND
return lhs & rhs;
} else if (op == BITOP_TYPE::BITOR) {
// OR
return lhs | rhs;
} else {
// XOR
return lhs ^ rhs;
}
};
generate_device_accumulator_bivariate<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
}
break;
default:
// Scalar OP
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
params.message_modulus, num_radix_blocks,
allocate_gpu_memory, size_tracker);
for (int i = 0; i < params.message_modulus; i++) {
auto rhs = i;
auto lut_univariate_scalar_f = [op, rhs](Torus x) -> Torus {
if (op == BITOP_TYPE::SCALAR_BITAND) {
// AND
return x & rhs;
} else if (op == BITOP_TYPE::SCALAR_BITOR) {
// OR
return x | rhs;
} else {
// XOR
return x ^ rhs;
}
};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, i), lut->get_degree(i),
lut->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_univariate_scalar_f,
gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
}
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
lut->release(streams, gpu_indexes, gpu_count);
delete lut;
}
};
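// Scratch memory for the block-by-block division specialized to
// message_modulus = 4 / carry_modulus = 4 (the 2_2 parameters). It holds the
// precomputed multiples of the divisor (d, 2*d, 3*d), the three candidate
// subtraction results tried at each iteration, and the sub-streams used to
// run those candidates concurrently.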
template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
bool gpu_memory_allocated;
int_radix_params params;
uint32_t active_gpu_count;
// memory objects for other operations
int_borrow_prop_memory<Torus> *overflow_sub_mem_1;
int_borrow_prop_memory<Torus> *overflow_sub_mem_2;
int_borrow_prop_memory<Torus> *overflow_sub_mem_3;
int_comparison_buffer<Torus> *comparison_buffer_1;
int_comparison_buffer<Torus> *comparison_buffer_2;
int_comparison_buffer<Torus> *comparison_buffer_3;
int_sub_and_propagate<Torus> *sub_and_propagate_mem;
int_bitop_buffer<Torus> *bitor_mem_1;
int_bitop_buffer<Torus> *bitor_mem_2;
int_bitop_buffer<Torus> *bitor_mem_3;
int_logical_scalar_shift_buffer<Torus> *shift_mem;
// lookup tables
int_radix_lut<Torus> *message_extract_lut_1;
int_radix_lut<Torus> *message_extract_lut_2;
int_radix_lut<Torus> *zero_out_if_not_1_lut_1;
int_radix_lut<Torus> *zero_out_if_not_1_lut_2;
int_radix_lut<Torus> *zero_out_if_not_2_lut_1;
int_radix_lut<Torus> *zero_out_if_not_2_lut_2;
int_radix_lut<Torus> *quotient_lut_1;
int_radix_lut<Torus> *quotient_lut_2;
int_radix_lut<Torus> *quotient_lut_3;
// sub streams
cudaStream_t *sub_streams_1;
cudaStream_t *sub_streams_2;
cudaStream_t *sub_streams_3;
cudaStream_t *sub_streams_4;
cudaStream_t *sub_streams_5;
cudaStream_t *sub_streams_6;
cudaStream_t *sub_streams_7;
// temporary device buffers
CudaRadixCiphertextFFI *d1; // num_blocks + 1
CudaRadixCiphertextFFI *d2; // num_blocks + 1
CudaRadixCiphertextFFI *d3; // num_blocks + 1
CudaRadixCiphertextFFI *low1; // num_blocks
CudaRadixCiphertextFFI *low2; // num_blocks
CudaRadixCiphertextFFI *low3; // num_blocks
CudaRadixCiphertextFFI *rem; // num_blocks
CudaRadixCiphertextFFI *sub_result_1; // num_blocks
CudaRadixCiphertextFFI *sub_result_2; // num_blocks
CudaRadixCiphertextFFI *sub_result_3; // num_blocks
CudaRadixCiphertextFFI *sub_1_overflowed; // num_blocks
CudaRadixCiphertextFFI *sub_2_overflowed; // num_blocks
CudaRadixCiphertextFFI *sub_3_overflowed; // num_blocks
CudaRadixCiphertextFFI *comparison_blocks_1; // num_blocks
CudaRadixCiphertextFFI *comparison_blocks_2; // num_blocks
CudaRadixCiphertextFFI *comparison_blocks_3; // num_blocks
CudaRadixCiphertextFFI *cmp_1; // boolean block
CudaRadixCiphertextFFI *cmp_2; // boolean block
CudaRadixCiphertextFFI *cmp_3; // boolean block
CudaRadixCiphertextFFI *c0; // single block
CudaRadixCiphertextFFI *c1; // single block
CudaRadixCiphertextFFI *c2; // single block
CudaRadixCiphertextFFI *c3; // single block
CudaRadixCiphertextFFI *q1; // single block
CudaRadixCiphertextFFI *q2; // single block
CudaRadixCiphertextFFI *q3; // single block
Torus **first_indexes_for_overflow_sub;
Torus **second_indexes_for_overflow_sub;
Torus **scalars_for_overflow_sub;
uint32_t max_indexes_to_erase;
// Allocates (and zero-initializes, when GPU memory allocation is enabled)
// the temporary arrays used by the cuda integer div_rem_2_2 operation
void init_temporary_buffers(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
uint32_t num_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
// more than one block temporary arrays
d1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], d1, num_blocks + 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
d2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], d2, num_blocks + 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
d3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], d3, num_blocks + 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
low1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], low1, num_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
low2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], low2, num_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
low3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], low3, num_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
rem = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], rem, num_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
sub_result_1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_result_1, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
sub_result_2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_result_2, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
sub_result_3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_result_3, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
sub_1_overflowed = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_1_overflowed, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
sub_2_overflowed = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_2_overflowed, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
sub_3_overflowed = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], sub_3_overflowed, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
comparison_blocks_1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], comparison_blocks_1, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
comparison_blocks_2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], comparison_blocks_2, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
comparison_blocks_3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], comparison_blocks_3, num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
// boolean blocks or single block temporary arrays
cmp_1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], cmp_1, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
cmp_2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], cmp_2, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
cmp_3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], cmp_3, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
c0 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], c0, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
c1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], c1, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
c2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], c2, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
c3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], c3, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
q1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], q1, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
q2 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], q2, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
q3 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], q3, 1, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
}
// initialize lookup tables for div_rem_2_2 operation
void init_lookup_tables(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
uint32_t num_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
message_extract_lut_1 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_blocks, allocate_gpu_memory, size_tracker);
message_extract_lut_2 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_blocks, allocate_gpu_memory, size_tracker);
zero_out_if_not_1_lut_1 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_blocks, allocate_gpu_memory, size_tracker);
zero_out_if_not_1_lut_2 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_blocks, allocate_gpu_memory, size_tracker);
zero_out_if_not_2_lut_1 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_blocks, allocate_gpu_memory, size_tracker);
zero_out_if_not_2_lut_2 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_blocks, allocate_gpu_memory, size_tracker);
quotient_lut_1 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1, 1,
allocate_gpu_memory, size_tracker);
quotient_lut_2 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1, 1,
allocate_gpu_memory, size_tracker);
quotient_lut_3 =
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1, 1,
allocate_gpu_memory, size_tracker);
auto message_modulus = params.message_modulus;
auto lut_f_message_extract = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
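// Input is packed as 2 * block + condition (see conditional_update in the
// division loop): keep the block value only when the condition bit is 1.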
auto zero_out_if_not_1_lut_f = [](Torus x) -> Torus {
Torus block = x / 2;
bool condition = (x & 1) == 1;
return block * (Torus)condition;
};
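// Input is packed as 3 * block + condition, with condition in {0, 1, 2}:
// keep the block value only when condition == 2.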
auto zero_out_if_not_2_lut_f = [](Torus x) -> Torus {
Torus block = x / 3;
bool condition = (x % 3) == 2;
return block * (Torus)condition;
};
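// The quotient LUTs map the selection flags c1, c2, c3 (computed in the
// division loop) to the candidate quotient digits 1, 2 and 3; at most one
// candidate fires per iteration (none when the digit is 0), so the digits
// can simply be summed.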
auto quotient_lut_1_f = [](Torus cond) -> Torus {
return (Torus)(cond == 2);
};
auto quotient_lut_2_f = [](Torus cond) -> Torus {
return (Torus)((cond == 2) * 2);
};
auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
luts[j]->broadcast_lut(streams, gpu_indexes);
}
luts[0] = zero_out_if_not_1_lut_1;
luts[1] = zero_out_if_not_1_lut_2;
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, zero_out_if_not_1_lut_f, gpu_memory_allocated);
luts[j]->broadcast_lut(streams, gpu_indexes);
}
luts[0] = zero_out_if_not_2_lut_1;
luts[1] = zero_out_if_not_2_lut_2;
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
luts[j]->broadcast_lut(streams, gpu_indexes);
}
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], quotient_lut_1->get_lut(0, 0),
quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
quotient_lut_1->broadcast_lut(streams, gpu_indexes);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], quotient_lut_2->get_lut(0, 0),
quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
quotient_lut_2->broadcast_lut(streams, gpu_indexes);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], quotient_lut_3->get_lut(0, 0),
quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);
quotient_lut_3->broadcast_lut(streams, gpu_indexes);
}
unsigned_int_div_rem_2_2_memory(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
this->params = params;
uint32_t compute_overflow = 1;
overflow_sub_mem_1 = new int_borrow_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
allocate_gpu_memory, size_tracker);
overflow_sub_mem_2 = new int_borrow_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
allocate_gpu_memory, size_tracker);
overflow_sub_mem_3 = new int_borrow_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
allocate_gpu_memory, size_tracker);
uint32_t group_size = overflow_sub_mem_1->group_size;
bool use_seq = overflow_sub_mem_1->prop_simu_group_carries_mem
->use_sequential_algorithm_to_resolve_group_carries;
create_indexes_for_overflow_sub(streams, gpu_indexes, num_blocks,
group_size, use_seq, allocate_gpu_memory,
size_tracker);
comparison_buffer_1 = new int_comparison_buffer<Torus>(
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::EQ, params,
num_blocks, false, allocate_gpu_memory, size_tracker);
comparison_buffer_2 = new int_comparison_buffer<Torus>(
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::EQ, params,
num_blocks, false, allocate_gpu_memory, size_tracker);
comparison_buffer_3 = new int_comparison_buffer<Torus>(
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::EQ, params,
num_blocks, false, allocate_gpu_memory, size_tracker);
sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks + 1,
outputFlag::FLAG_NONE, allocate_gpu_memory, size_tracker);
bitor_mem_1 = new int_bitop_buffer<Torus>(
streams, gpu_indexes, gpu_count, BITOP_TYPE::BITOR, params, num_blocks,
allocate_gpu_memory, size_tracker);
bitor_mem_2 = new int_bitop_buffer<Torus>(
streams, gpu_indexes, gpu_count, BITOP_TYPE::BITOR, params, num_blocks,
allocate_gpu_memory, size_tracker);
bitor_mem_3 = new int_bitop_buffer<Torus>(
streams, gpu_indexes, gpu_count, BITOP_TYPE::BITOR, params, num_blocks,
allocate_gpu_memory, size_tracker);
shift_mem = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
params, 2 * num_blocks, allocate_gpu_memory, size_tracker);
init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks,
allocate_gpu_memory, size_tracker);
init_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks,
allocate_gpu_memory, size_tracker);
sub_streams_1 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_2 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_3 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_4 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_5 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_6 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
sub_streams_7 =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < active_gpu_count; j++) {
sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_3[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_4[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_5[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_6[j] = cuda_create_stream(gpu_indexes[j]);
sub_streams_7[j] = cuda_create_stream(gpu_indexes[j]);
}
}
void create_indexes_for_overflow_sub(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t num_blocks, uint32_t group_size,
bool use_seq, bool allocate_gpu_memory,
uint64_t &size_tracker) {
max_indexes_to_erase = num_blocks;
first_indexes_for_overflow_sub =
(Torus **)malloc(num_blocks * sizeof(Torus *));
second_indexes_for_overflow_sub =
(Torus **)malloc(num_blocks * sizeof(Torus *));
scalars_for_overflow_sub = (Torus **)malloc(num_blocks * sizeof(Torus *));
Torus *h_lut_indexes = (Torus *)malloc(num_blocks * sizeof(Torus));
Torus *h_scalar = (Torus *)malloc(num_blocks * sizeof(Torus));
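// The division loop runs the overflowing subtraction on every prefix length
// from 1 to num_blocks, so the LUT index vectors (and padding-bit scalars)
// are precomputed here for each possible block count.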
// Extra LUT indexes for the first step of the overflowing subtraction
for (int nb = 1; nb <= num_blocks; nb++) {
first_indexes_for_overflow_sub[nb - 1] =
(Torus *)cuda_malloc_with_size_tracking_async(
nb * sizeof(Torus), streams[0], gpu_indexes[0], size_tracker,
allocate_gpu_memory);
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
bool is_last_index = (index == (nb - 1));
if (is_last_index) {
if (nb == 1) {
h_lut_indexes[index] = 2 * group_size;
} else {
h_lut_indexes[index] = 2;
}
} else if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
first_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
nb * sizeof(Torus), streams[0], gpu_indexes[0], allocate_gpu_memory);
}
// Extra LUT indexes (and padding-bit scalars) for the second step
for (int nb = 1; nb <= num_blocks; nb++) {
second_indexes_for_overflow_sub[nb - 1] =
(Torus *)cuda_malloc_with_size_tracking_async(
nb * sizeof(Torus), streams[0], gpu_indexes[0], size_tracker,
allocate_gpu_memory);
scalars_for_overflow_sub[nb - 1] =
(Torus *)cuda_malloc_with_size_tracking_async(
nb * sizeof(Torus), streams[0], gpu_indexes[0], size_tracker,
allocate_gpu_memory);
for (int index = 0; index < nb; index++) {
uint32_t grouping_index = index / group_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % group_size;
if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else if (index_in_grouping == (group_size - 1)) {
if (use_seq) {
int inner_index = (grouping_index - 1) % (group_size - 1);
h_lut_indexes[index] = inner_index + 2 * group_size;
} else {
h_lut_indexes[index] = 2 * group_size;
}
} else {
h_lut_indexes[index] = index_in_grouping + group_size;
}
bool may_have_its_padding_bit_set =
!is_in_first_grouping && (index_in_grouping == group_size - 1);
if (may_have_its_padding_bit_set) {
if (use_seq) {
h_scalar[index] = 1 << ((grouping_index - 1) % (group_size - 1));
} else {
h_scalar[index] = 1;
}
} else {
h_scalar[index] = 0;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
second_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
nb * sizeof(Torus), streams[0], gpu_indexes[0], allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
scalars_for_overflow_sub[nb - 1], h_scalar, nb * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
}
free(h_lut_indexes);
free(h_scalar);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
// release and delete integer ops memory objects
overflow_sub_mem_1->release(streams, gpu_indexes, gpu_count);
overflow_sub_mem_2->release(streams, gpu_indexes, gpu_count);
overflow_sub_mem_3->release(streams, gpu_indexes, gpu_count);
comparison_buffer_1->release(streams, gpu_indexes, gpu_count);
comparison_buffer_2->release(streams, gpu_indexes, gpu_count);
comparison_buffer_3->release(streams, gpu_indexes, gpu_count);
sub_and_propagate_mem->release(streams, gpu_indexes, gpu_count);
bitor_mem_1->release(streams, gpu_indexes, gpu_count);
bitor_mem_2->release(streams, gpu_indexes, gpu_count);
bitor_mem_3->release(streams, gpu_indexes, gpu_count);
shift_mem->release(streams, gpu_indexes, gpu_count);
delete overflow_sub_mem_1;
delete overflow_sub_mem_2;
delete overflow_sub_mem_3;
delete comparison_buffer_1;
delete comparison_buffer_2;
delete comparison_buffer_3;
delete sub_and_propagate_mem;
delete bitor_mem_1;
delete bitor_mem_2;
delete bitor_mem_3;
delete shift_mem;
// release and delete lut objects
message_extract_lut_1->release(streams, gpu_indexes, gpu_count);
message_extract_lut_2->release(streams, gpu_indexes, gpu_count);
zero_out_if_not_1_lut_1->release(streams, gpu_indexes, gpu_count);
zero_out_if_not_1_lut_2->release(streams, gpu_indexes, gpu_count);
zero_out_if_not_2_lut_1->release(streams, gpu_indexes, gpu_count);
zero_out_if_not_2_lut_2->release(streams, gpu_indexes, gpu_count);
quotient_lut_1->release(streams, gpu_indexes, gpu_count);
quotient_lut_2->release(streams, gpu_indexes, gpu_count);
quotient_lut_3->release(streams, gpu_indexes, gpu_count);
delete message_extract_lut_1;
delete message_extract_lut_2;
delete zero_out_if_not_1_lut_1;
delete zero_out_if_not_1_lut_2;
delete zero_out_if_not_2_lut_1;
delete zero_out_if_not_2_lut_2;
delete quotient_lut_1;
delete quotient_lut_2;
delete quotient_lut_3;
// release and delete temporary buffers
release_radix_ciphertext_async(streams[0], gpu_indexes[0], d1,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], d2,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], d3,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], low1,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], low2,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], low3,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], rem,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_result_1,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_result_2,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_result_3,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_1_overflowed,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_2_overflowed,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_3_overflowed,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
comparison_blocks_1, gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
comparison_blocks_2, gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
comparison_blocks_3, gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], cmp_1,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], cmp_2,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], cmp_3,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], c0,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], c1,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], c2,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], c3,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], q1,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], q2,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], q3,
gpu_memory_allocated);
delete d1;
delete d2;
delete d3;
delete low1;
delete low2;
delete low3;
delete rem;
delete sub_result_1;
delete sub_result_2;
delete sub_result_3;
delete sub_1_overflowed;
delete sub_2_overflowed;
delete sub_3_overflowed;
delete comparison_blocks_1;
delete comparison_blocks_2;
delete comparison_blocks_3;
delete cmp_1;
delete cmp_2;
delete cmp_3;
delete c0;
delete c1;
delete c2;
delete c3;
delete q1;
delete q2;
delete q3;
for (int i = 0; i < max_indexes_to_erase; i++) {
cuda_drop_with_size_tracking_async(first_indexes_for_overflow_sub[i],
streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(second_indexes_for_overflow_sub[i],
streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(scalars_for_overflow_sub[i],
streams[0], gpu_indexes[0],
gpu_memory_allocated);
}
free(first_indexes_for_overflow_sub);
free(second_indexes_for_overflow_sub);
free(scalars_for_overflow_sub);
}
};
template <typename Torus> struct unsigned_int_div_rem_memory {
int_radix_params params;
uint32_t active_gpu_count;
@@ -4162,6 +4927,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
int_logical_scalar_shift_buffer<Torus> *shift_mem_2;
int_borrow_prop_memory<Torus> *overflow_sub_mem;
int_comparison_buffer<Torus> *comparison_buffer;
unsigned_int_div_rem_2_2_memory<Torus> *div_rem_2_2_mem;
// lookup tables
int_radix_lut<Torus> **masking_luts_1;
@@ -4209,7 +4975,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
uint32_t const *gpu_indexes, uint32_t gpu_count,
uint32_t num_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
// non boolean temporary arrays, with `num_blocks` blocks
remainder1 = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -4349,7 +5114,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
luts[j]->broadcast_lut(streams, gpu_indexes);
}
// Give name to closures to improve readability
auto overflow_happened = [](uint64_t overflow_sum) {
return overflow_sum != 0;
};
@@ -4458,8 +5222,15 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
this->params = params;
if (params.message_modulus == 4 && params.carry_modulus == 4) {
div_rem_2_2_mem = new unsigned_int_div_rem_2_2_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_blocks,
allocate_gpu_memory, size_tracker);
return;
}
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
params, 2 * num_blocks, allocate_gpu_memory, size_tracker);
@@ -4602,6 +5373,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
if (params.message_modulus == 4 && params.carry_modulus == 4) {
div_rem_2_2_mem->release(streams, gpu_indexes, gpu_count);
delete div_rem_2_2_mem;
return;
}
uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus);
// release and delete other operation memory objects
@@ -4609,6 +5386,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
shift_mem_2->release(streams, gpu_indexes, gpu_count);
overflow_sub_mem->release(streams, gpu_indexes, gpu_count);
comparison_buffer->release(streams, gpu_indexes, gpu_count);
delete shift_mem_1;
delete shift_mem_2;
delete overflow_sub_mem;
@@ -4750,89 +5528,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
}
};
template <typename Torus> struct int_bitop_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
BITOP_TYPE op;
bool gpu_memory_allocated;
int_bitop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, BITOP_TYPE op, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
switch (op) {
case BITAND:
case BITOR:
case BITXOR:
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
num_radix_blocks, allocate_gpu_memory,
size_tracker);
{
auto lut_bivariate_f = [op](Torus lhs, Torus rhs) -> Torus {
if (op == BITOP_TYPE::BITAND) {
// AND
return lhs & rhs;
} else if (op == BITOP_TYPE::BITOR) {
// OR
return lhs | rhs;
} else {
// XOR
return lhs ^ rhs;
}
};
generate_device_accumulator_bivariate<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
}
break;
default:
// Scalar OP
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
params.message_modulus, num_radix_blocks,
allocate_gpu_memory, size_tracker);
for (int i = 0; i < params.message_modulus; i++) {
auto rhs = i;
auto lut_univariate_scalar_f = [op, rhs](Torus x) -> Torus {
if (op == BITOP_TYPE::SCALAR_BITAND) {
// AND
return x & rhs;
} else if (op == BITOP_TYPE::SCALAR_BITOR) {
// OR
return x | rhs;
} else {
// XOR
return x ^ rhs;
}
};
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], lut->get_lut(0, i), lut->get_degree(i),
lut->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_univariate_scalar_f,
gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes);
}
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
lut->release(streams, gpu_indexes, gpu_count);
delete lut;
}
};
template <typename Torus> struct int_scalar_mul_buffer {
int_radix_params params;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_buffer;
@@ -5204,45 +5899,6 @@ template <typename Torus> struct int_scalar_mul_high_buffer {
}
};
template <typename Torus> struct int_sub_and_propagate {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *neg_rhs_array;
int_sc_prop_memory<Torus> *sc_prop_mem;
int_sub_and_propagate(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
const int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->sc_prop_mem = new int_sc_prop_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
requested_flag_in, (uint32_t)0, allocate_gpu_memory, size_tracker);
this->neg_rhs_array = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], neg_rhs_array, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
sc_prop_mem->release(streams, gpu_indexes, gpu_count);
delete sc_prop_mem;
release_radix_ciphertext_async(streams[0], gpu_indexes[0], neg_rhs_array,
allocate_gpu_memory);
delete neg_rhs_array;
}
};
template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
int_radix_params params;

View File

@@ -4,6 +4,7 @@
#include "crypto/keyswitch.cuh"
#include "device.h"
#include "integer/abs.cuh"
#include "integer/cast.cuh"
#include "integer/comparison.cuh"
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
@@ -32,6 +33,356 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
return size_tracker;
}
template <typename Torus>
__host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
uint64_t *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
unsigned_int_div_rem_2_2_memory<uint64_t> *mem_ptr) {
// alias
auto radix_params = mem_ptr->params;
auto num_blocks = quotient->num_radix_blocks;
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder,
numerator);
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
quotient, 0, num_blocks);
quotient->num_radix_blocks = 0;
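// The quotient starts empty; one digit (radix block) is produced per loop
// iteration, MSB first, and inserted at block index 0.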
// Computes 2*d by extending and shifting
auto extend_2xd_f = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count) {
// d2 is allocated with num_blocks + 1 blocks, so we extend by one block.
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(mem_ptr->d2, divisor,
streams, gpu_indexes);
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->d2, 1, mem_ptr->shift_mem,
bsks, ksks, ms_noise_reduction_key, mem_ptr->d2->num_radix_blocks);
};
// Computes 3*d = 4*d - d using block shift and subtraction
auto extend_3xd_f = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count) {
// d1 is allocated with num_blocks + 1 blocks, so we extend by one block.
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(mem_ptr->d1, divisor,
streams, gpu_indexes);
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
mem_ptr->d3, mem_ptr->d1, 1,
mem_ptr->d1->num_radix_blocks);
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->d3, 0, 1);
host_sub_and_propagate_single_carry(
streams, gpu_indexes, gpu_count, mem_ptr->d3, mem_ptr->d1, nullptr,
nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
ms_noise_reduction_key, outputFlag::FLAG_NONE, 0);
// trim d1 by one MSB block, back to num_blocks blocks
mem_ptr->d1->num_radix_blocks -= 1;
};
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
extend_2xd_f(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
extend_3xd_f(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
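// At this point d1 = d (num_blocks blocks), d2 = 2*d and d3 = 3*d
// (num_blocks + 1 blocks each). The main loop performs schoolbook division
// in base 4 (the message modulus): for each numerator block, MSB to LSB, it
// tries rem - d, rem - 2*d and rem - 3*d in parallel and keeps the largest
// multiple that does not underflow.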
for (int block_index = num_blocks - 1; block_index >= 0; block_index--) {
uint32_t slice_len = num_blocks - block_index;
mem_ptr->low1->num_radix_blocks = slice_len;
mem_ptr->low2->num_radix_blocks = slice_len;
mem_ptr->low3->num_radix_blocks = slice_len;
mem_ptr->rem->num_radix_blocks = slice_len;
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->low1, 0, slice_len,
mem_ptr->d1, 0, slice_len);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->low2, 0, slice_len,
mem_ptr->d2, 0, slice_len);
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->low3, 0, slice_len,
mem_ptr->d3, 0, slice_len);
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], mem_ptr->rem, 0, slice_len, remainder,
block_index, num_blocks);
uint32_t compute_overflow = 1;
uint32_t uses_input_borrow = 0;
auto first_indexes =
mem_ptr->first_indexes_for_overflow_sub[mem_ptr->rem->num_radix_blocks -
1];
auto second_indexes =
mem_ptr
->second_indexes_for_overflow_sub[mem_ptr->rem->num_radix_blocks -
1];
auto scalar_indexes =
mem_ptr->scalars_for_overflow_sub[mem_ptr->rem->num_radix_blocks - 1];
auto sub_result_f = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *sub_result,
CudaRadixCiphertextFFI *sub_overflowed,
int_borrow_prop_memory<Torus> *overflow_sub_mem,
CudaRadixCiphertextFFI *low) {
sub_result->num_radix_blocks = low->num_radix_blocks;
overflow_sub_mem->update_lut_indexes(streams, gpu_indexes, first_indexes,
second_indexes, scalar_indexes,
mem_ptr->rem->num_radix_blocks);
host_integer_overflowing_sub<uint64_t>(
streams, gpu_indexes, gpu_count, sub_result, mem_ptr->rem, low,
sub_overflowed, (const CudaRadixCiphertextFFI *)nullptr,
overflow_sub_mem, bsks, ksks, ms_noise_reduction_key,
compute_overflow, uses_input_borrow);
};
auto cmp_f = [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count,
CudaRadixCiphertextFFI *out_boolean_block,
CudaRadixCiphertextFFI *comparison_blocks,
CudaRadixCiphertextFFI *d,
int_comparison_buffer<Torus> *comparison_buffer) {
CudaRadixCiphertextFFI *d_msb = new CudaRadixCiphertextFFI;
uint32_t slice_start = num_blocks - block_index;
uint32_t slice_end = d->num_radix_blocks;
as_radix_ciphertext_slice<Torus>(d_msb, d, slice_start, slice_end);
comparison_blocks->num_radix_blocks = d_msb->num_radix_blocks;
if (d_msb->num_radix_blocks == 0) {
cuda_memset_async((Torus *)out_boolean_block->ptr, 0,
sizeof(Torus) *
(out_boolean_block->lwe_dimension + 1),
streams[0], gpu_indexes[0]);
} else {
host_compare_blocks_with_zero<Torus>(
streams, gpu_indexes, gpu_count, comparison_blocks, d_msb,
comparison_buffer, bsks, ksks, ms_noise_reduction_key,
d_msb->num_radix_blocks, comparison_buffer->is_zero_lut);
are_all_comparisons_block_true(
streams, gpu_indexes, gpu_count, out_boolean_block,
comparison_blocks, comparison_buffer, bsks, ksks,
ms_noise_reduction_key, comparison_blocks->num_radix_blocks);
host_negation<Torus>(
streams[0], gpu_indexes[0], (Torus *)out_boolean_block->ptr,
(Torus *)out_boolean_block->ptr, radix_params.big_lwe_dimension, 1);
// we add the plaintext encoding of 1 directly; this block only runs for
// message_modulus = 4 and carry_modulus = 4 (4 plaintext bits plus one
// padding bit, hence the shift by 5).
const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
host_addition_plaintext_scalar<Torus>(
streams[0], gpu_indexes[0], (Torus *)out_boolean_block->ptr,
(Torus *)out_boolean_block->ptr, encoded_scalar,
radix_params.big_lwe_dimension, 1);
}
delete d_msb;
};
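// The three candidate subtractions and the three MSB-emptiness checks are
// independent of one another, so they are dispatched on six sub-streams and
// joined below.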
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
sub_result_f(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
mem_ptr->sub_result_1, mem_ptr->sub_1_overflowed,
mem_ptr->overflow_sub_mem_1, mem_ptr->low3);
sub_result_f(mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
mem_ptr->sub_result_2, mem_ptr->sub_2_overflowed,
mem_ptr->overflow_sub_mem_2, mem_ptr->low2);
sub_result_f(mem_ptr->sub_streams_3, gpu_indexes, gpu_count,
mem_ptr->sub_result_3, mem_ptr->sub_3_overflowed,
mem_ptr->overflow_sub_mem_3, mem_ptr->low1);
cmp_f(mem_ptr->sub_streams_4, gpu_indexes, gpu_count, mem_ptr->cmp_1,
mem_ptr->comparison_blocks_1, mem_ptr->d3,
mem_ptr->comparison_buffer_1);
cmp_f(mem_ptr->sub_streams_5, gpu_indexes, gpu_count, mem_ptr->cmp_2,
mem_ptr->comparison_blocks_2, mem_ptr->d2,
mem_ptr->comparison_buffer_2);
cmp_f(mem_ptr->sub_streams_6, gpu_indexes, gpu_count, mem_ptr->cmp_3,
mem_ptr->comparison_blocks_3, mem_ptr->d1,
mem_ptr->comparison_buffer_3);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_5[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_6[j], gpu_indexes[j]);
}
auto r1 = mem_ptr->sub_result_3;
auto r2 = mem_ptr->sub_result_2;
auto r3 = mem_ptr->sub_result_1;
auto o1 = mem_ptr->sub_3_overflowed;
auto o2 = mem_ptr->sub_2_overflowed;
auto o3 = mem_ptr->sub_1_overflowed;
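// After this renaming, rX / oX denote rem - X*d and its borrow flag
// (sub_result_f was called with low3 = 3*d first, hence the reversal).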
// bitor: o3 |= cmp_1, accounting for the high blocks of 3*d
host_integer_radix_bitop_kb(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
o3, o3, mem_ptr->cmp_1, mem_ptr->bitor_mem_1,
bsks, ksks, ms_noise_reduction_key);
// bitor: o2 |= cmp_2, accounting for the high blocks of 2*d
host_integer_radix_bitop_kb(mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
o2, o2, mem_ptr->cmp_2, mem_ptr->bitor_mem_2,
bsks, ksks, ms_noise_reduction_key);
// bitor: o1 |= cmp_3, accounting for the high blocks of d
host_integer_radix_bitop_kb(mem_ptr->sub_streams_3, gpu_indexes, gpu_count,
o1, o1, mem_ptr->cmp_3, mem_ptr->bitor_mem_3,
bsks, ksks, ms_noise_reduction_key);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
}
// The cx variables tell whether the corresponding subtraction result should
// be kept, and which value the quotient block should take:
//
// - c3 and c0 take values in {0, 1}
// - c2 and c1 take values in {0, 1, 2}, where 2 means true and 0 or 1 mean
//   false
// c3 = !o3
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->c3, 0, 1, o3, 0, 1);
host_negation<Torus>(streams[0], gpu_indexes[0], (Torus *)mem_ptr->c3->ptr,
(Torus *)mem_ptr->c3->ptr,
radix_params.big_lwe_dimension, 1);
const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
host_addition_plaintext_scalar<Torus>(
streams[0], gpu_indexes[0], (Torus *)mem_ptr->c3->ptr,
(Torus *)mem_ptr->c3->ptr, encoded_scalar,
radix_params.big_lwe_dimension, 1);
// c2 = !o2 + o3
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->c2, 0, 1, o2, 0, 1);
host_negation<Torus>(streams[0], gpu_indexes[0], (Torus *)mem_ptr->c2->ptr,
(Torus *)mem_ptr->c2->ptr,
radix_params.big_lwe_dimension, 1);
host_addition_plaintext_scalar<Torus>(
streams[0], gpu_indexes[0], (Torus *)mem_ptr->c2->ptr,
(Torus *)mem_ptr->c2->ptr, encoded_scalar,
radix_params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->c2, mem_ptr->c2,
o3, 1, 4, 4);
// c1 = !o1 + o2
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->c1, 0, 1, o1, 0, 1);
host_negation<Torus>(streams[0], gpu_indexes[0], (Torus *)mem_ptr->c1->ptr,
(Torus *)mem_ptr->c1->ptr,
radix_params.big_lwe_dimension, 1);
host_addition_plaintext_scalar<Torus>(
streams[0], gpu_indexes[0], (Torus *)mem_ptr->c1->ptr,
(Torus *)mem_ptr->c1->ptr, encoded_scalar,
radix_params.big_lwe_dimension, 1);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->c1, mem_ptr->c1,
o2, 1, 4, 4);
// c0 = o1 (direct copy)
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->c0, 0, 1, o1, 0, 1);
auto conditional_update = [&](cudaStream_t const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count,
CudaRadixCiphertextFFI *cx,
CudaRadixCiphertextFFI *rx,
int_radix_lut<Torus> *lut, Torus factor) {
auto rx_list = to_lwe_ciphertext_list(rx);
host_cleartext_multiplication<Torus>(streams[0], gpu_indexes[0],
(Torus *)rx->ptr, &rx_list, factor);
host_add_the_same_block_to_all_blocks<Torus>(streams[0], gpu_indexes[0],
rx, rx, cx, 4, 4);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, rx, rx, bsks, ksks,
ms_noise_reduction_key, lut, rx->num_radix_blocks);
};
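// conditional_update packs each block of the candidate as
// factor * block + cx, then the zero_out LUT (defined in
// integer_utilities.h) keeps the block only when cx encodes "true": 1 for
// factor 2, 2 for factor 3.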
auto calculate_quotient_bits =
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextFFI *q,
CudaRadixCiphertextFFI *c, int_radix_lut<Torus> *lut) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, q, c, bsks, ksks,
ms_noise_reduction_key, lut, 1);
};
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
conditional_update(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
mem_ptr->c3, r3, mem_ptr->zero_out_if_not_1_lut_1, 2);
conditional_update(mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
mem_ptr->c2, r2, mem_ptr->zero_out_if_not_2_lut_1, 3);
conditional_update(mem_ptr->sub_streams_3, gpu_indexes, gpu_count,
mem_ptr->c1, r1, mem_ptr->zero_out_if_not_2_lut_2, 3);
conditional_update(mem_ptr->sub_streams_4, gpu_indexes, gpu_count,
mem_ptr->c0, mem_ptr->rem,
mem_ptr->zero_out_if_not_1_lut_2, 2);
calculate_quotient_bits(mem_ptr->sub_streams_5, gpu_indexes, 1, mem_ptr->q1,
mem_ptr->c1, mem_ptr->quotient_lut_1);
calculate_quotient_bits(mem_ptr->sub_streams_6, gpu_indexes, 1, mem_ptr->q2,
mem_ptr->c2, mem_ptr->quotient_lut_2);
calculate_quotient_bits(mem_ptr->sub_streams_7, gpu_indexes, 1, mem_ptr->q3,
mem_ptr->c3, mem_ptr->quotient_lut_3);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_5[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_6[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_7[j], gpu_indexes[j]);
}
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->rem, mem_ptr->rem,
r3, mem_ptr->rem->num_radix_blocks, 4, 4);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->rem, mem_ptr->rem,
r2, mem_ptr->rem->num_radix_blocks, 4, 4);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->rem, mem_ptr->rem,
r1, mem_ptr->rem->num_radix_blocks, 4, 4);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->q1, mem_ptr->q1,
mem_ptr->q2, 1, 4, 4);
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->q1, mem_ptr->q1,
mem_ptr->q3, 1, 4, 4);
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, mem_ptr->rem,
mem_ptr->rem, bsks, ksks, ms_noise_reduction_key,
mem_ptr->message_extract_lut_1, mem_ptr->rem->num_radix_blocks);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, mem_ptr->q1,
mem_ptr->q1, bsks, ksks, ms_noise_reduction_key,
mem_ptr->message_extract_lut_2, 1);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
}
size_t tmp_rem_size = mem_ptr->rem->num_radix_blocks;
mem_ptr->rem->num_radix_blocks = remainder->num_radix_blocks;
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], remainder, block_index,
remainder->num_radix_blocks, mem_ptr->rem, 0, tmp_rem_size);
mem_ptr->rem->num_radix_blocks = tmp_rem_size;
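// Prepend the new quotient digit: digits arrive MSB first, so inserting at
// block 0 pushes the previously produced digits toward the MSB and leaves
// the quotient in little-endian block order.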
insert_block_in_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
mem_ptr->q1, quotient, 0);
}
}
template <typename Torus>
__host__ void host_unsigned_integer_div_rem_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -50,6 +401,14 @@ __host__ void host_unsigned_integer_div_rem_kb(
remainder->lwe_dimension != divisor->lwe_dimension ||
remainder->lwe_dimension != quotient->lwe_dimension)
PANIC("Cuda error: input and output lwe dimension must be equal")
if (mem_ptr->params.message_modulus == 4 &&
mem_ptr->params.carry_modulus == 4) {
host_unsigned_integer_div_rem_kb_block_by_block_2_2<Torus>(
streams, gpu_indexes, gpu_count, quotient, remainder, numerator,
divisor, bsks, ksks, ms_noise_reduction_key, mem_ptr->div_rem_2_2_mem);
return;
}
auto radix_params = mem_ptr->params;
auto num_blocks = quotient->num_radix_blocks;

View File

@@ -7,6 +7,12 @@
#include "utils/helper_profile.cuh"
#include "utils/kernel_dimensions.cuh"
inline CudaLweCiphertextListFFI
to_lwe_ciphertext_list(CudaRadixCiphertextFFI *radix) {
return {.ptr = radix->ptr,
.num_radix_blocks = radix->num_radix_blocks,
.lwe_dimension = radix->lwe_dimension};
}
template <typename Torus>
void create_zero_radix_ciphertext_async(cudaStream_t const stream,
uint32_t const gpu_index,

View File

@@ -105,134 +105,6 @@ const _: () = {
ms_input_variance
) - 32usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaLweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub num_radix_blocks: u32,
pub lwe_dimension: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaLweCiphertextListFFI"]
[::std::mem::size_of::<CudaLweCiphertextListFFI>() - 16usize];
["Alignment of CudaLweCiphertextListFFI"]
[::std::mem::align_of::<CudaLweCiphertextListFFI>() - 8usize];
["Offset of field: CudaLweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaLweCiphertextListFFI::num_radix_blocks"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, num_radix_blocks) - 8usize];
["Offset of field: CudaLweCiphertextListFFI::lwe_dimension"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, lwe_dimension) - 12usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaPackedGlweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub storage_log_modulus: u32,
pub lwe_per_glwe: u32,
pub total_lwe_bodies_count: u32,
pub glwe_dimension: u32,
pub polynomial_size: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaPackedGlweCiphertextListFFI"]
[::std::mem::size_of::<CudaPackedGlweCiphertextListFFI>() - 32usize];
["Alignment of CudaPackedGlweCiphertextListFFI"]
[::std::mem::align_of::<CudaPackedGlweCiphertextListFFI>() - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::storage_log_modulus"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, storage_log_modulus) - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::lwe_per_glwe"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, lwe_per_glwe) - 12usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::total_lwe_bodies_count"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, total_lwe_bodies_count) - 16usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::glwe_dimension"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, glwe_dimension) - 20usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::polynomial_size"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
};
unsafe extern "C" {
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
compression_glwe_dimension: u32,
compression_polynomial_size: u32,
lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
num_radix_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
lwe_per_glwe: u32,
allocate_gpu_memory: bool,
) -> u64;
}
unsafe extern "C" {
pub fn scratch_cuda_integer_decompress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
encryption_glwe_dimension: u32,
encryption_polynomial_size: u32,
compression_glwe_dimension: u32,
compression_polynomial_size: u32,
lwe_dimension: u32,
pbs_level: u32,
pbs_base_log: u32,
num_blocks_to_decompress: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
allocate_ms_array: bool,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
glwe_array_out: *mut CudaPackedGlweCiphertextListFFI,
lwe_array_in: *const CudaLweCiphertextListFFI,
fp_ksk: *const *mut ffi::c_void,
mem_ptr: *mut i8,
);
}
unsafe extern "C" {
pub fn cuda_integer_decompress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lwe_array_out: *mut CudaLweCiphertextListFFI,
glwe_in: *const CudaPackedGlweCiphertextListFFI,
indexes_array: *const u32,
bsks: *const *mut ffi::c_void,
mem_ptr: *mut i8,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
pub const SHIFT_OR_ROTATE_TYPE_LEFT_SHIFT: SHIFT_OR_ROTATE_TYPE = 0;
pub const SHIFT_OR_ROTATE_TYPE_RIGHT_SHIFT: SHIFT_OR_ROTATE_TYPE = 1;
pub const SHIFT_OR_ROTATE_TYPE_LEFT_ROTATE: SHIFT_OR_ROTATE_TYPE = 2;
@@ -367,6 +239,55 @@ const _: () = {
divisor_has_more_bits_than_numerator
) - 60usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaLweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub num_radix_blocks: u32,
pub lwe_dimension: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaLweCiphertextListFFI"]
[::std::mem::size_of::<CudaLweCiphertextListFFI>() - 16usize];
["Alignment of CudaLweCiphertextListFFI"]
[::std::mem::align_of::<CudaLweCiphertextListFFI>() - 8usize];
["Offset of field: CudaLweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaLweCiphertextListFFI::num_radix_blocks"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, num_radix_blocks) - 8usize];
["Offset of field: CudaLweCiphertextListFFI::lwe_dimension"]
[::std::mem::offset_of!(CudaLweCiphertextListFFI, lwe_dimension) - 12usize];
};
#[repr(C)]
#[derive(Debug, Copy, Clone)]
pub struct CudaPackedGlweCiphertextListFFI {
pub ptr: *mut ffi::c_void,
pub storage_log_modulus: u32,
pub lwe_per_glwe: u32,
pub total_lwe_bodies_count: u32,
pub glwe_dimension: u32,
pub polynomial_size: u32,
}
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
const _: () = {
["Size of CudaPackedGlweCiphertextListFFI"]
[::std::mem::size_of::<CudaPackedGlweCiphertextListFFI>() - 32usize];
["Alignment of CudaPackedGlweCiphertextListFFI"]
[::std::mem::align_of::<CudaPackedGlweCiphertextListFFI>() - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::ptr"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, ptr) - 0usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::storage_log_modulus"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, storage_log_modulus) - 8usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::lwe_per_glwe"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, lwe_per_glwe) - 12usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::total_lwe_bodies_count"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, total_lwe_bodies_count) - 16usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::glwe_dimension"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, glwe_dimension) - 20usize];
["Offset of field: CudaPackedGlweCiphertextListFFI::polynomial_size"]
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
};
unsafe extern "C" {
pub fn scratch_cuda_apply_univariate_lut_kb_64(
streams: *const *mut ffi::c_void,
@@ -1934,6 +1855,85 @@ unsafe extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
compression_glwe_dimension: u32,
compression_polynomial_size: u32,
lwe_dimension: u32,
ks_level: u32,
ks_base_log: u32,
num_radix_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
lwe_per_glwe: u32,
allocate_gpu_memory: bool,
) -> u64;
}
unsafe extern "C" {
pub fn scratch_cuda_integer_decompress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
encryption_glwe_dimension: u32,
encryption_polynomial_size: u32,
compression_glwe_dimension: u32,
compression_polynomial_size: u32,
lwe_dimension: u32,
pbs_level: u32,
pbs_base_log: u32,
num_blocks_to_decompress: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
allocate_ms_array: bool,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
glwe_array_out: *mut CudaPackedGlweCiphertextListFFI,
lwe_array_in: *const CudaLweCiphertextListFFI,
fp_ksk: *const *mut ffi::c_void,
mem_ptr: *mut i8,
);
}
unsafe extern "C" {
pub fn cuda_integer_decompress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
lwe_array_out: *mut CudaLweCiphertextListFFI,
glwe_in: *const CudaPackedGlweCiphertextListFFI,
indexes_array: *const u32,
bsks: *const *mut ffi::c_void,
mem_ptr: *mut i8,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_compress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
pub type KS_TYPE = ffi::c_uint;