mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-04-28 03:01:21 -04:00
Compare commits
1 Commits
hw-team/pg
...
bb/feat/di
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
84e43630b4 |
@@ -2,27 +2,7 @@
|
||||
#define CUDA_INTEGER_COMPRESSION_H
|
||||
|
||||
#include "../../pbs/pbs_enums.h"
|
||||
|
||||
// C FFI view over a list of LWE ciphertexts handled by the CUDA backend.
typedef struct {
  // Opaque pointer to the ciphertext data (presumably device memory —
  // confirm at call sites).
  void *ptr;
  // Number of radix blocks stored in the list.
  uint32_t num_radix_blocks;
  // LWE dimension of each ciphertext in the list.
  uint32_t lwe_dimension;
} CudaLweCiphertextListFFI;
|
||||
|
||||
// C FFI view over a list of LWE ciphertexts packed into GLWE ciphertexts
// (the compressed representation).
typedef struct {
  // Opaque pointer to the packed data (presumably device memory — confirm
  // at call sites).
  void *ptr;
  // log2 of the storage modulus used when packing.
  uint32_t storage_log_modulus;
  uint32_t lwe_per_glwe;
  // Input LWEs are grouped by groups of `lwe_per_glwe` (the last group may be
  // smaller). Each group is then packed into one GLWE with `lwe_per_glwe`
  // bodies (one for each LWE of the group). In the end the total number of
  // bodies is equal to the number of input LWE.
  uint32_t total_lwe_bodies_count;
  uint32_t glwe_dimension;
  uint32_t polynomial_size;
} CudaPackedGlweCiphertextListFFI;
|
||||
|
||||
#include "../integer.h"
|
||||
extern "C" {
|
||||
uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
|
||||
@@ -80,6 +80,26 @@ typedef struct {
|
||||
bool const divisor_has_more_bits_than_numerator;
|
||||
} CudaScalarDivisorFFI;
|
||||
|
||||
// C FFI view over a list of LWE ciphertexts handled by the CUDA backend.
typedef struct {
  // Opaque pointer to the ciphertext data (presumably device memory —
  // confirm at call sites).
  void *ptr;
  // Number of radix blocks stored in the list.
  uint32_t num_radix_blocks;
  // LWE dimension of each ciphertext in the list.
  uint32_t lwe_dimension;
} CudaLweCiphertextListFFI;
|
||||
|
||||
// C FFI view over a list of LWE ciphertexts packed into GLWE ciphertexts
// (the compressed representation).
typedef struct {
  // Opaque pointer to the packed data (presumably device memory — confirm
  // at call sites).
  void *ptr;
  // log2 of the storage modulus used when packing.
  uint32_t storage_log_modulus;
  uint32_t lwe_per_glwe;
  // Input LWEs are grouped by groups of `lwe_per_glwe` (the last group may be
  // smaller). Each group is then packed into one GLWE with `lwe_per_glwe`
  // bodies (one for each LWE of the group). In the end the total number of
  // bodies is equal to the number of input LWE.
  uint32_t total_lwe_bodies_count;
  uint32_t glwe_dimension;
  uint32_t polynomial_size;
} CudaPackedGlweCiphertextListFFI;
|
||||
|
||||
uint64_t scratch_cuda_apply_univariate_lut_kb_64(
|
||||
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
|
||||
|
||||
@@ -4153,6 +4153,771 @@ template <typename Torus> struct int_comparison_buffer {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_sub_and_propagate {
|
||||
int_radix_params params;
|
||||
bool allocate_gpu_memory;
|
||||
|
||||
CudaRadixCiphertextFFI *neg_rhs_array;
|
||||
|
||||
int_sc_prop_memory<Torus> *sc_prop_mem;
|
||||
|
||||
int_sub_and_propagate(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
const int_radix_params params,
|
||||
uint32_t num_radix_blocks, uint32_t requested_flag_in,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
|
||||
this->params = params;
|
||||
this->allocate_gpu_memory = allocate_gpu_memory;
|
||||
|
||||
this->sc_prop_mem = new int_sc_prop_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
|
||||
requested_flag_in, (uint32_t)0, allocate_gpu_memory, size_tracker);
|
||||
|
||||
this->neg_rhs_array = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], neg_rhs_array, num_radix_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
|
||||
sc_prop_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete sc_prop_mem;
|
||||
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], neg_rhs_array,
|
||||
allocate_gpu_memory);
|
||||
delete neg_rhs_array;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_bitop_buffer {
|
||||
|
||||
int_radix_params params;
|
||||
int_radix_lut<Torus> *lut;
|
||||
BITOP_TYPE op;
|
||||
bool gpu_memory_allocated;
|
||||
|
||||
int_bitop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, BITOP_TYPE op, int_radix_params params,
|
||||
uint32_t num_radix_blocks, bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->op = op;
|
||||
this->params = params;
|
||||
|
||||
switch (op) {
|
||||
case BITAND:
|
||||
case BITOR:
|
||||
case BITXOR:
|
||||
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
|
||||
num_radix_blocks, allocate_gpu_memory,
|
||||
size_tracker);
|
||||
{
|
||||
auto lut_bivariate_f = [op](Torus lhs, Torus rhs) -> Torus {
|
||||
if (op == BITOP_TYPE::BITAND) {
|
||||
// AND
|
||||
return lhs & rhs;
|
||||
} else if (op == BITOP_TYPE::BITOR) {
|
||||
// OR
|
||||
return lhs | rhs;
|
||||
} else {
|
||||
// XOR
|
||||
return lhs ^ rhs;
|
||||
}
|
||||
};
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
|
||||
lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
|
||||
lut->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// Scalar OP
|
||||
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
|
||||
params.message_modulus, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
for (int i = 0; i < params.message_modulus; i++) {
|
||||
auto rhs = i;
|
||||
|
||||
auto lut_univariate_scalar_f = [op, rhs](Torus x) -> Torus {
|
||||
if (op == BITOP_TYPE::SCALAR_BITAND) {
|
||||
// AND
|
||||
return x & rhs;
|
||||
} else if (op == BITOP_TYPE::SCALAR_BITOR) {
|
||||
// OR
|
||||
return x | rhs;
|
||||
} else {
|
||||
// XOR
|
||||
return x ^ rhs;
|
||||
}
|
||||
};
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], lut->get_lut(0, i), lut->get_degree(i),
|
||||
lut->get_max_degree(i), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_univariate_scalar_f,
|
||||
gpu_memory_allocated);
|
||||
lut->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
lut->release(streams, gpu_indexes, gpu_count);
|
||||
delete lut;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
|
||||
bool gpu_memory_allocated;
|
||||
|
||||
int_radix_params params;
|
||||
uint32_t active_gpu_count;
|
||||
|
||||
// memory objects for other operations
|
||||
int_borrow_prop_memory<Torus> *overflow_sub_mem_1;
|
||||
int_borrow_prop_memory<Torus> *overflow_sub_mem_2;
|
||||
int_borrow_prop_memory<Torus> *overflow_sub_mem_3;
|
||||
int_comparison_buffer<Torus> *comparison_buffer_1;
|
||||
int_comparison_buffer<Torus> *comparison_buffer_2;
|
||||
int_comparison_buffer<Torus> *comparison_buffer_3;
|
||||
int_sub_and_propagate<Torus> *sub_and_propagate_mem;
|
||||
int_bitop_buffer<Torus> *bitor_mem_1;
|
||||
int_bitop_buffer<Torus> *bitor_mem_2;
|
||||
int_bitop_buffer<Torus> *bitor_mem_3;
|
||||
int_logical_scalar_shift_buffer<Torus> *shift_mem;
|
||||
|
||||
// lookup tables
|
||||
int_radix_lut<Torus> *message_extract_lut_1;
|
||||
int_radix_lut<Torus> *message_extract_lut_2;
|
||||
int_radix_lut<Torus> *zero_out_if_not_1_lut_1;
|
||||
int_radix_lut<Torus> *zero_out_if_not_1_lut_2;
|
||||
int_radix_lut<Torus> *zero_out_if_not_2_lut_1;
|
||||
int_radix_lut<Torus> *zero_out_if_not_2_lut_2;
|
||||
int_radix_lut<Torus> *quotient_lut_1;
|
||||
int_radix_lut<Torus> *quotient_lut_2;
|
||||
int_radix_lut<Torus> *quotient_lut_3;
|
||||
|
||||
// sub streams
|
||||
cudaStream_t *sub_streams_1;
|
||||
cudaStream_t *sub_streams_2;
|
||||
cudaStream_t *sub_streams_3;
|
||||
cudaStream_t *sub_streams_4;
|
||||
cudaStream_t *sub_streams_5;
|
||||
cudaStream_t *sub_streams_6;
|
||||
cudaStream_t *sub_streams_7;
|
||||
|
||||
// temporary device buffers
|
||||
CudaRadixCiphertextFFI *d1; // num_blocks + 1
|
||||
CudaRadixCiphertextFFI *d2; // num_blocks + 1
|
||||
CudaRadixCiphertextFFI *d3; // num_blocks + 1
|
||||
CudaRadixCiphertextFFI *low1; // num_blocks
|
||||
CudaRadixCiphertextFFI *low2; // num_blocks
|
||||
CudaRadixCiphertextFFI *low3; // num_blocks
|
||||
CudaRadixCiphertextFFI *rem; // num_blocks
|
||||
CudaRadixCiphertextFFI *sub_result_1; // num_blocks
|
||||
CudaRadixCiphertextFFI *sub_result_2; // num_blocks
|
||||
CudaRadixCiphertextFFI *sub_result_3; // num_blocks
|
||||
CudaRadixCiphertextFFI *sub_1_overflowed; // num_blocks
|
||||
CudaRadixCiphertextFFI *sub_2_overflowed; // num_blocks
|
||||
CudaRadixCiphertextFFI *sub_3_overflowed; // num_blocks
|
||||
CudaRadixCiphertextFFI *comparison_blocks_1; // num_blocks
|
||||
CudaRadixCiphertextFFI *comparison_blocks_2; // num_blocks
|
||||
CudaRadixCiphertextFFI *comparison_blocks_3; // num_blocks
|
||||
CudaRadixCiphertextFFI *cmp_1; // boolean block
|
||||
CudaRadixCiphertextFFI *cmp_2; // boolean block
|
||||
CudaRadixCiphertextFFI *cmp_3; // boolean block
|
||||
CudaRadixCiphertextFFI *c0; // single block
|
||||
CudaRadixCiphertextFFI *c1; // single block
|
||||
CudaRadixCiphertextFFI *c2; // single block
|
||||
CudaRadixCiphertextFFI *c3; // single block
|
||||
CudaRadixCiphertextFFI *q1; // single block
|
||||
CudaRadixCiphertextFFI *q2; // single block
|
||||
CudaRadixCiphertextFFI *q3; // single block
|
||||
|
||||
Torus **first_indexes_for_overflow_sub;
|
||||
Torus **second_indexes_for_overflow_sub;
|
||||
Torus **scalars_for_overflow_sub;
|
||||
uint32_t max_indexes_to_erase;
|
||||
|
||||
// allocate and initialize if needed, temporary arrays used to calculate
|
||||
// cuda integer div_rem_2_2 operation
|
||||
void init_temporary_buffers(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
uint32_t num_blocks, bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker) {
|
||||
// more than one block temporary arrays
|
||||
d1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], d1, num_blocks + 1,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
d2 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], d2, num_blocks + 1,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
d3 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], d3, num_blocks + 1,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
low1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], low1, num_blocks, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
low2 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], low2, num_blocks, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
low3 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], low3, num_blocks, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
rem = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], rem, num_blocks, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
sub_result_1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], sub_result_1, num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
sub_result_2 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], sub_result_2, num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
sub_result_3 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], sub_result_3, num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
sub_1_overflowed = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], sub_1_overflowed, 1,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
sub_2_overflowed = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], sub_2_overflowed, 1,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
sub_3_overflowed = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], sub_3_overflowed, 1,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
comparison_blocks_1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], comparison_blocks_1, num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
comparison_blocks_2 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], comparison_blocks_2, num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
comparison_blocks_3 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], comparison_blocks_3, num_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
|
||||
// boolean blocks or single block temporary arrays
|
||||
cmp_1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], cmp_1, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
cmp_2 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], cmp_2, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
cmp_3 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], cmp_3, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
c0 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], c0, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
c1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], c1, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
c2 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], c2, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
c3 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], c3, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
q1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], q1, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
q2 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], q2, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
|
||||
q3 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], q3, 1, params.big_lwe_dimension,
|
||||
size_tracker, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
// initialize lookup tables for div_rem_2_2 operation
|
||||
void init_lookup_tables(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
uint32_t num_blocks, bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker) {
|
||||
message_extract_lut_1 =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
|
||||
num_blocks, allocate_gpu_memory, size_tracker);
|
||||
message_extract_lut_2 =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
|
||||
num_blocks, allocate_gpu_memory, size_tracker);
|
||||
zero_out_if_not_1_lut_1 =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
|
||||
num_blocks, allocate_gpu_memory, size_tracker);
|
||||
zero_out_if_not_1_lut_2 =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
|
||||
num_blocks, allocate_gpu_memory, size_tracker);
|
||||
zero_out_if_not_2_lut_1 =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
|
||||
num_blocks, allocate_gpu_memory, size_tracker);
|
||||
zero_out_if_not_2_lut_2 =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
|
||||
num_blocks, allocate_gpu_memory, size_tracker);
|
||||
quotient_lut_1 =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1, 1,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
quotient_lut_2 =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1, 1,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
quotient_lut_3 =
|
||||
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1, 1,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
auto message_modulus = params.message_modulus;
|
||||
auto lut_f_message_extract = [message_modulus](Torus x) -> Torus {
|
||||
return x % message_modulus;
|
||||
};
|
||||
|
||||
auto zero_out_if_not_1_lut_f = [](Torus x) -> Torus {
|
||||
Torus block = x / 2;
|
||||
bool condition = (x & 1) == 1;
|
||||
return block * (Torus)condition;
|
||||
};
|
||||
auto zero_out_if_not_2_lut_f = [](Torus x) -> Torus {
|
||||
Torus block = x / 3;
|
||||
bool condition = (x % 3) == 2;
|
||||
return block * (Torus)condition;
|
||||
};
|
||||
auto quotient_lut_1_f = [](Torus cond) -> Torus {
|
||||
return (Torus)(cond == 2);
|
||||
};
|
||||
auto quotient_lut_2_f = [](Torus cond) -> Torus {
|
||||
return (Torus)((cond == 2) * 2);
|
||||
};
|
||||
auto quotient_lut_3_f = [](Torus cond) -> Torus { return cond * 3; };
|
||||
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
|
||||
message_extract_lut_2};
|
||||
for (int j = 0; j < 2; j++) {
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
|
||||
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
|
||||
luts[j]->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
|
||||
luts[0] = zero_out_if_not_1_lut_1;
|
||||
luts[1] = zero_out_if_not_1_lut_2;
|
||||
for (int j = 0; j < 2; j++) {
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
|
||||
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, zero_out_if_not_1_lut_f, gpu_memory_allocated);
|
||||
luts[j]->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
|
||||
luts[0] = zero_out_if_not_2_lut_1;
|
||||
luts[1] = zero_out_if_not_2_lut_2;
|
||||
for (int j = 0; j < 2; j++) {
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
|
||||
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
|
||||
luts[j]->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], quotient_lut_1->get_lut(0, 0),
|
||||
quotient_lut_1->get_degree(0), quotient_lut_1->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, quotient_lut_1_f, gpu_memory_allocated);
|
||||
quotient_lut_1->broadcast_lut(streams, gpu_indexes);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], quotient_lut_2->get_lut(0, 0),
|
||||
quotient_lut_2->get_degree(0), quotient_lut_2->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, quotient_lut_2_f, gpu_memory_allocated);
|
||||
quotient_lut_2->broadcast_lut(streams, gpu_indexes);
|
||||
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], quotient_lut_3->get_lut(0, 0),
|
||||
quotient_lut_3->get_degree(0), quotient_lut_3->get_max_degree(0),
|
||||
params.glwe_dimension, params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);
|
||||
quotient_lut_3->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
|
||||
// Builds all scratch state for the 2_2-parameter unsigned division with
// remainder: three borrow propagators, three comparison buffers, a
// sub-and-propagate buffer, three bitwise-OR buffers, a left-shift buffer,
// the LUTs, the temporary ciphertexts, and seven sets of per-GPU
// sub-streams for concurrent branches.
unsigned_int_div_rem_2_2_memory(cudaStream_t const *streams,
                                uint32_t const *gpu_indexes,
                                uint32_t gpu_count, int_radix_params params,
                                uint32_t num_blocks, bool allocate_gpu_memory,
                                uint64_t &size_tracker) {
  gpu_memory_allocated = allocate_gpu_memory;
  active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
  this->params = params;

  // Overflowing-subtraction scratch, one per concurrent branch.
  uint32_t compute_overflow = 1;
  int_borrow_prop_memory<Torus> **borrow_mems[3] = {
      &overflow_sub_mem_1, &overflow_sub_mem_2, &overflow_sub_mem_3};
  for (auto mem : borrow_mems)
    *mem = new int_borrow_prop_memory<Torus>(
        streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
        allocate_gpu_memory, size_tracker);

  uint32_t group_size = overflow_sub_mem_1->group_size;
  bool use_seq = overflow_sub_mem_1->prop_simu_group_carries_mem
                     ->use_sequential_algorithm_to_resolve_group_carries;
  create_indexes_for_overflow_sub(streams, gpu_indexes, num_blocks,
                                  group_size, use_seq, allocate_gpu_memory,
                                  size_tracker);

  // Equality-comparison scratch, one per concurrent branch.
  int_comparison_buffer<Torus> **cmp_bufs[3] = {
      &comparison_buffer_1, &comparison_buffer_2, &comparison_buffer_3};
  for (auto buf : cmp_bufs)
    *buf = new int_comparison_buffer<Torus>(
        streams, gpu_indexes, gpu_count, COMPARISON_TYPE::EQ, params,
        num_blocks, false, allocate_gpu_memory, size_tracker);

  sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
      streams, gpu_indexes, gpu_count, params, num_blocks + 1,
      outputFlag::FLAG_NONE, allocate_gpu_memory, size_tracker);

  // Bitwise-OR scratch, one per concurrent branch.
  int_bitop_buffer<Torus> **bitor_mems[3] = {&bitor_mem_1, &bitor_mem_2,
                                             &bitor_mem_3};
  for (auto mem : bitor_mems)
    *mem = new int_bitop_buffer<Torus>(
        streams, gpu_indexes, gpu_count, BITOP_TYPE::BITOR, params,
        num_blocks, allocate_gpu_memory, size_tracker);

  shift_mem = new int_logical_scalar_shift_buffer<Torus>(
      streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
      params, 2 * num_blocks, allocate_gpu_memory, size_tracker);

  init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks,
                     allocate_gpu_memory, size_tracker);
  init_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks,
                         allocate_gpu_memory, size_tracker);

  // Seven sets of per-GPU sub-streams; creation order per GPU index matches
  // the original (set 1..7 for each j).
  cudaStream_t **stream_sets[7] = {&sub_streams_1, &sub_streams_2,
                                   &sub_streams_3, &sub_streams_4,
                                   &sub_streams_5, &sub_streams_6,
                                   &sub_streams_7};
  for (auto set : stream_sets)
    *set = (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
  for (uint j = 0; j < active_gpu_count; j++)
    for (auto set : stream_sets)
      (*set)[j] = cuda_create_stream(gpu_indexes[j]);
}
|
||||
|
||||
// Precomputes, for every radix size nb in [1, num_blocks], the LUT index
// vectors and padding-bit scalar vectors used by the overflowing
// subtraction's two propagation steps, and uploads them to
// streams[0]/gpu_indexes[0].
void create_indexes_for_overflow_sub(cudaStream_t const *streams,
                                     uint32_t const *gpu_indexes,
                                     uint32_t num_blocks, uint32_t group_size,
                                     bool use_seq, bool allocate_gpu_memory,
                                     uint64_t &size_tracker) {
  max_indexes_to_erase = num_blocks;

  first_indexes_for_overflow_sub =
      (Torus **)malloc(num_blocks * sizeof(Torus *));
  second_indexes_for_overflow_sub =
      (Torus **)malloc(num_blocks * sizeof(Torus *));
  scalars_for_overflow_sub = (Torus **)malloc(num_blocks * sizeof(Torus *));

  // Host staging buffers, reused across all nb values.
  Torus *h_lut_indexes = (Torus *)malloc(num_blocks * sizeof(Torus));
  Torus *h_scalar = (Torus *)malloc(num_blocks * sizeof(Torus));

  // LUT indexes for the first step.
  for (int nb = 1; nb <= num_blocks; nb++) {
    first_indexes_for_overflow_sub[nb - 1] =
        (Torus *)cuda_malloc_with_size_tracking_async(
            nb * sizeof(Torus), streams[0], gpu_indexes[0], size_tracker,
            allocate_gpu_memory);

    for (int i = 0; i < nb; i++) {
      uint32_t grouping = i / group_size;
      uint32_t pos_in_group = i % group_size;
      if (i == nb - 1) {
        // Last block uses a dedicated LUT: index 2 * group_size when the
        // radix has a single block, 2 otherwise.
        h_lut_indexes[i] = (nb == 1) ? 2 * group_size : 2;
      } else if (grouping == 0) {
        h_lut_indexes[i] = pos_in_group;
      } else {
        h_lut_indexes[i] = pos_in_group + group_size;
      }
    }
    cuda_memcpy_with_size_tracking_async_to_gpu(
        first_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
        nb * sizeof(Torus), streams[0], gpu_indexes[0], allocate_gpu_memory);
  }

  // LUT indexes and padding-bit scalars for the second step.
  for (int nb = 1; nb <= num_blocks; nb++) {
    second_indexes_for_overflow_sub[nb - 1] =
        (Torus *)cuda_malloc_with_size_tracking_async(
            nb * sizeof(Torus), streams[0], gpu_indexes[0], size_tracker,
            allocate_gpu_memory);
    scalars_for_overflow_sub[nb - 1] =
        (Torus *)cuda_malloc_with_size_tracking_async(
            nb * sizeof(Torus), streams[0], gpu_indexes[0], size_tracker,
            allocate_gpu_memory);

    for (int i = 0; i < nb; i++) {
      uint32_t grouping = i / group_size;
      uint32_t pos_in_group = i % group_size;
      bool in_first_grouping = (grouping == 0);
      bool last_of_group = (pos_in_group == group_size - 1);

      if (in_first_grouping) {
        h_lut_indexes[i] = pos_in_group;
      } else if (last_of_group) {
        // Group-carry resolution LUT; the sequential algorithm selects one
        // LUT per inner position, the parallel one a single LUT.
        h_lut_indexes[i] =
            use_seq ? (grouping - 1) % (group_size - 1) + 2 * group_size
                    : 2 * group_size;
      } else {
        h_lut_indexes[i] = pos_in_group + group_size;
      }

      // Only the last block of a non-first group may have its padding bit
      // set.
      if (!in_first_grouping && last_of_group) {
        h_scalar[i] = use_seq ? 1 << ((grouping - 1) % (group_size - 1)) : 1;
      } else {
        h_scalar[i] = 0;
      }
    }
    cuda_memcpy_with_size_tracking_async_to_gpu(
        second_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
        nb * sizeof(Torus), streams[0], gpu_indexes[0], allocate_gpu_memory);
    cuda_memcpy_with_size_tracking_async_to_gpu(
        scalars_for_overflow_sub[nb - 1], h_scalar, nb * sizeof(Torus),
        streams[0], gpu_indexes[0], allocate_gpu_memory);
  }
  free(h_lut_indexes);
  free(h_scalar);
}
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
// release and delete integer ops memory objects
|
||||
overflow_sub_mem_1->release(streams, gpu_indexes, gpu_count);
|
||||
overflow_sub_mem_2->release(streams, gpu_indexes, gpu_count);
|
||||
overflow_sub_mem_3->release(streams, gpu_indexes, gpu_count);
|
||||
comparison_buffer_1->release(streams, gpu_indexes, gpu_count);
|
||||
comparison_buffer_2->release(streams, gpu_indexes, gpu_count);
|
||||
comparison_buffer_3->release(streams, gpu_indexes, gpu_count);
|
||||
sub_and_propagate_mem->release(streams, gpu_indexes, gpu_count);
|
||||
bitor_mem_1->release(streams, gpu_indexes, gpu_count);
|
||||
bitor_mem_2->release(streams, gpu_indexes, gpu_count);
|
||||
bitor_mem_3->release(streams, gpu_indexes, gpu_count);
|
||||
shift_mem->release(streams, gpu_indexes, gpu_count);
|
||||
|
||||
delete overflow_sub_mem_1;
|
||||
delete overflow_sub_mem_2;
|
||||
delete overflow_sub_mem_3;
|
||||
delete comparison_buffer_1;
|
||||
delete comparison_buffer_2;
|
||||
delete comparison_buffer_3;
|
||||
delete sub_and_propagate_mem;
|
||||
delete bitor_mem_1;
|
||||
delete bitor_mem_2;
|
||||
delete bitor_mem_3;
|
||||
delete shift_mem;
|
||||
|
||||
// release and delete lut objects
|
||||
message_extract_lut_1->release(streams, gpu_indexes, gpu_count);
|
||||
message_extract_lut_2->release(streams, gpu_indexes, gpu_count);
|
||||
zero_out_if_not_1_lut_1->release(streams, gpu_indexes, gpu_count);
|
||||
zero_out_if_not_1_lut_2->release(streams, gpu_indexes, gpu_count);
|
||||
zero_out_if_not_2_lut_1->release(streams, gpu_indexes, gpu_count);
|
||||
zero_out_if_not_2_lut_2->release(streams, gpu_indexes, gpu_count);
|
||||
quotient_lut_1->release(streams, gpu_indexes, gpu_count);
|
||||
quotient_lut_2->release(streams, gpu_indexes, gpu_count);
|
||||
quotient_lut_3->release(streams, gpu_indexes, gpu_count);
|
||||
|
||||
delete message_extract_lut_1;
|
||||
delete message_extract_lut_2;
|
||||
delete zero_out_if_not_1_lut_1;
|
||||
delete zero_out_if_not_1_lut_2;
|
||||
delete zero_out_if_not_2_lut_1;
|
||||
delete zero_out_if_not_2_lut_2;
|
||||
delete quotient_lut_1;
|
||||
delete quotient_lut_2;
|
||||
delete quotient_lut_3;
|
||||
|
||||
// release and delete temporary buffers
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], d1,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], d2,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], d3,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], low1,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], low2,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], low3,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], rem,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_result_1,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_result_2,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_result_3,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_1_overflowed,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_2_overflowed,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], sub_3_overflowed,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
comparison_blocks_1, gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
comparison_blocks_2, gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
comparison_blocks_3, gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], cmp_1,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], cmp_2,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], cmp_3,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], c0,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], c1,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], c2,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], c3,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], q1,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], q2,
|
||||
gpu_memory_allocated);
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], q3,
|
||||
gpu_memory_allocated);
|
||||
|
||||
delete d1;
|
||||
delete d2;
|
||||
delete d3;
|
||||
delete low1;
|
||||
delete low2;
|
||||
delete low3;
|
||||
delete rem;
|
||||
delete sub_result_1;
|
||||
delete sub_result_2;
|
||||
delete sub_result_3;
|
||||
delete sub_1_overflowed;
|
||||
delete sub_2_overflowed;
|
||||
delete sub_3_overflowed;
|
||||
delete comparison_blocks_1;
|
||||
delete comparison_blocks_2;
|
||||
delete comparison_blocks_3;
|
||||
delete cmp_1;
|
||||
delete cmp_2;
|
||||
delete cmp_3;
|
||||
delete c0;
|
||||
delete c1;
|
||||
delete c2;
|
||||
delete c3;
|
||||
delete q1;
|
||||
delete q2;
|
||||
delete q3;
|
||||
|
||||
for (int i = 0; i < max_indexes_to_erase; i++) {
|
||||
cuda_drop_with_size_tracking_async(first_indexes_for_overflow_sub[i],
|
||||
streams[0], gpu_indexes[0],
|
||||
gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(second_indexes_for_overflow_sub[i],
|
||||
streams[0], gpu_indexes[0],
|
||||
gpu_memory_allocated);
|
||||
cuda_drop_with_size_tracking_async(scalars_for_overflow_sub[i],
|
||||
streams[0], gpu_indexes[0],
|
||||
gpu_memory_allocated);
|
||||
}
|
||||
free(first_indexes_for_overflow_sub);
|
||||
free(second_indexes_for_overflow_sub);
|
||||
free(scalars_for_overflow_sub);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
int_radix_params params;
|
||||
uint32_t active_gpu_count;
|
||||
@@ -4162,6 +4927,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
int_logical_scalar_shift_buffer<Torus> *shift_mem_2;
|
||||
int_borrow_prop_memory<Torus> *overflow_sub_mem;
|
||||
int_comparison_buffer<Torus> *comparison_buffer;
|
||||
unsigned_int_div_rem_2_2_memory<Torus> *div_rem_2_2_mem;
|
||||
|
||||
// lookup tables
|
||||
int_radix_lut<Torus> **masking_luts_1;
|
||||
@@ -4209,7 +4975,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
uint32_t num_blocks, bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker) {
|
||||
|
||||
// non boolean temporary arrays, with `num_blocks` blocks
|
||||
remainder1 = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
@@ -4349,7 +5114,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
luts[j]->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
|
||||
// Give name to closures to improve readability
|
||||
auto overflow_happened = [](uint64_t overflow_sum) {
|
||||
return overflow_sum != 0;
|
||||
};
|
||||
@@ -4458,8 +5222,15 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
active_gpu_count = get_active_gpu_count(2 * num_blocks, gpu_count);
|
||||
|
||||
this->params = params;
|
||||
|
||||
if (params.message_modulus == 4 && params.carry_modulus == 4) {
|
||||
div_rem_2_2_mem = new unsigned_int_div_rem_2_2_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
return;
|
||||
}
|
||||
|
||||
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
|
||||
streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT,
|
||||
params, 2 * num_blocks, allocate_gpu_memory, size_tracker);
|
||||
@@ -4602,6 +5373,12 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
|
||||
if (params.message_modulus == 4 && params.carry_modulus == 4) {
|
||||
div_rem_2_2_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete div_rem_2_2_mem;
|
||||
return;
|
||||
}
|
||||
uint32_t num_bits_in_message = 31 - __builtin_clz(params.message_modulus);
|
||||
|
||||
// release and delete other operation memory objects
|
||||
@@ -4609,6 +5386,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
shift_mem_2->release(streams, gpu_indexes, gpu_count);
|
||||
overflow_sub_mem->release(streams, gpu_indexes, gpu_count);
|
||||
comparison_buffer->release(streams, gpu_indexes, gpu_count);
|
||||
|
||||
delete shift_mem_1;
|
||||
delete shift_mem_2;
|
||||
delete overflow_sub_mem;
|
||||
@@ -4750,89 +5528,6 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_bitop_buffer {
|
||||
|
||||
int_radix_params params;
|
||||
int_radix_lut<Torus> *lut;
|
||||
BITOP_TYPE op;
|
||||
bool gpu_memory_allocated;
|
||||
|
||||
int_bitop_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, BITOP_TYPE op, int_radix_params params,
|
||||
uint32_t num_radix_blocks, bool allocate_gpu_memory,
|
||||
uint64_t &size_tracker) {
|
||||
gpu_memory_allocated = allocate_gpu_memory;
|
||||
this->op = op;
|
||||
this->params = params;
|
||||
|
||||
switch (op) {
|
||||
case BITAND:
|
||||
case BITOR:
|
||||
case BITXOR:
|
||||
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 1,
|
||||
num_radix_blocks, allocate_gpu_memory,
|
||||
size_tracker);
|
||||
{
|
||||
auto lut_bivariate_f = [op](Torus lhs, Torus rhs) -> Torus {
|
||||
if (op == BITOP_TYPE::BITAND) {
|
||||
// AND
|
||||
return lhs & rhs;
|
||||
} else if (op == BITOP_TYPE::BITOR) {
|
||||
// OR
|
||||
return lhs | rhs;
|
||||
} else {
|
||||
// XOR
|
||||
return lhs ^ rhs;
|
||||
}
|
||||
};
|
||||
|
||||
generate_device_accumulator_bivariate<Torus>(
|
||||
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
|
||||
lut->get_max_degree(0), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
|
||||
lut->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// Scalar OP
|
||||
lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
|
||||
params.message_modulus, num_radix_blocks,
|
||||
allocate_gpu_memory, size_tracker);
|
||||
|
||||
for (int i = 0; i < params.message_modulus; i++) {
|
||||
auto rhs = i;
|
||||
|
||||
auto lut_univariate_scalar_f = [op, rhs](Torus x) -> Torus {
|
||||
if (op == BITOP_TYPE::SCALAR_BITAND) {
|
||||
// AND
|
||||
return x & rhs;
|
||||
} else if (op == BITOP_TYPE::SCALAR_BITOR) {
|
||||
// OR
|
||||
return x | rhs;
|
||||
} else {
|
||||
// XOR
|
||||
return x ^ rhs;
|
||||
}
|
||||
};
|
||||
generate_device_accumulator<Torus>(
|
||||
streams[0], gpu_indexes[0], lut->get_lut(0, i), lut->get_degree(i),
|
||||
lut->get_max_degree(i), params.glwe_dimension,
|
||||
params.polynomial_size, params.message_modulus,
|
||||
params.carry_modulus, lut_univariate_scalar_f,
|
||||
gpu_memory_allocated);
|
||||
lut->broadcast_lut(streams, gpu_indexes);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
lut->release(streams, gpu_indexes, gpu_count);
|
||||
delete lut;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_scalar_mul_buffer {
|
||||
int_radix_params params;
|
||||
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_buffer;
|
||||
@@ -5204,45 +5899,6 @@ template <typename Torus> struct int_scalar_mul_high_buffer {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_sub_and_propagate {
|
||||
int_radix_params params;
|
||||
bool allocate_gpu_memory;
|
||||
|
||||
CudaRadixCiphertextFFI *neg_rhs_array;
|
||||
|
||||
int_sc_prop_memory<Torus> *sc_prop_mem;
|
||||
|
||||
int_sub_and_propagate(cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
const int_radix_params params,
|
||||
uint32_t num_radix_blocks, uint32_t requested_flag_in,
|
||||
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
||||
|
||||
this->params = params;
|
||||
this->allocate_gpu_memory = allocate_gpu_memory;
|
||||
|
||||
this->sc_prop_mem = new int_sc_prop_memory<Torus>(
|
||||
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
|
||||
requested_flag_in, (uint32_t)0, allocate_gpu_memory, size_tracker);
|
||||
|
||||
this->neg_rhs_array = new CudaRadixCiphertextFFI;
|
||||
create_zero_radix_ciphertext_async<Torus>(
|
||||
streams[0], gpu_indexes[0], neg_rhs_array, num_radix_blocks,
|
||||
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count) {
|
||||
|
||||
sc_prop_mem->release(streams, gpu_indexes, gpu_count);
|
||||
delete sc_prop_mem;
|
||||
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0], neg_rhs_array,
|
||||
allocate_gpu_memory);
|
||||
delete neg_rhs_array;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
|
||||
|
||||
int_radix_params params;
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#include "crypto/keyswitch.cuh"
|
||||
#include "device.h"
|
||||
#include "integer/abs.cuh"
|
||||
#include "integer/cast.cuh"
|
||||
#include "integer/comparison.cuh"
|
||||
#include "integer/integer.cuh"
|
||||
#include "integer/integer_utilities.h"
|
||||
@@ -32,6 +33,356 @@ __host__ uint64_t scratch_cuda_integer_div_rem_kb(
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *quotient,
|
||||
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
|
||||
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
|
||||
uint64_t *const *ksks,
|
||||
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
|
||||
unsigned_int_div_rem_2_2_memory<uint64_t> *mem_ptr) {
|
||||
|
||||
// alias
|
||||
auto radix_params = mem_ptr->params;
|
||||
auto num_blocks = quotient->num_radix_blocks;
|
||||
|
||||
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], remainder,
|
||||
numerator);
|
||||
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
quotient, 0, num_blocks);
|
||||
quotient->num_radix_blocks = 0;
|
||||
// Computes 2*d by extending and shifting
|
||||
auto extend_2xd_f = [&](cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count) {
|
||||
// d2 is allocated with num_blocks + 1; so we extend with 1.
|
||||
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(mem_ptr->d2, divisor,
|
||||
streams, gpu_indexes);
|
||||
host_integer_radix_logical_scalar_shift_kb_inplace<Torus>(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->d2, 1, mem_ptr->shift_mem,
|
||||
bsks, ksks, ms_noise_reduction_key, mem_ptr->d2->num_radix_blocks);
|
||||
};
|
||||
|
||||
// Computes 3*d = 4*d - d using block shift and subtraction
|
||||
auto extend_3xd_f = [&](cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count) {
|
||||
// d1 is allocated with num_blocks + 1; so we extend with 1.
|
||||
host_extend_radix_with_trivial_zero_blocks_msb<Torus>(mem_ptr->d1, divisor,
|
||||
streams, gpu_indexes);
|
||||
host_radix_blocks_rotate_right<Torus>(streams, gpu_indexes, gpu_count,
|
||||
mem_ptr->d3, mem_ptr->d1, 1,
|
||||
mem_ptr->d1->num_radix_blocks);
|
||||
set_zero_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
mem_ptr->d3, 0, 1);
|
||||
host_sub_and_propagate_single_carry(
|
||||
streams, gpu_indexes, gpu_count, mem_ptr->d3, mem_ptr->d1, nullptr,
|
||||
nullptr, mem_ptr->sub_and_propagate_mem, bsks, ksks,
|
||||
ms_noise_reduction_key, outputFlag::FLAG_NONE, 0);
|
||||
// trim d1 by one msb block
|
||||
mem_ptr->d1->num_radix_blocks -= 1;
|
||||
};
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
extend_2xd_f(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
|
||||
extend_3xd_f(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
|
||||
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
for (int block_index = num_blocks - 1; block_index >= 0; block_index--) {
|
||||
uint32_t slice_len = num_blocks - block_index;
|
||||
|
||||
mem_ptr->low1->num_radix_blocks = slice_len;
|
||||
mem_ptr->low2->num_radix_blocks = slice_len;
|
||||
mem_ptr->low3->num_radix_blocks = slice_len;
|
||||
mem_ptr->rem->num_radix_blocks = slice_len;
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
mem_ptr->low1, 0, slice_len,
|
||||
mem_ptr->d1, 0, slice_len);
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
mem_ptr->low2, 0, slice_len,
|
||||
mem_ptr->d2, 0, slice_len);
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
mem_ptr->low3, 0, slice_len,
|
||||
mem_ptr->d3, 0, slice_len);
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], mem_ptr->rem, 0, slice_len, remainder,
|
||||
block_index, num_blocks);
|
||||
uint32_t compute_overflow = 1;
|
||||
uint32_t uses_input_borrow = 0;
|
||||
auto first_indexes =
|
||||
mem_ptr->first_indexes_for_overflow_sub[mem_ptr->rem->num_radix_blocks -
|
||||
1];
|
||||
auto second_indexes =
|
||||
mem_ptr
|
||||
->second_indexes_for_overflow_sub[mem_ptr->rem->num_radix_blocks -
|
||||
1];
|
||||
auto scalar_indexes =
|
||||
mem_ptr->scalars_for_overflow_sub[mem_ptr->rem->num_radix_blocks - 1];
|
||||
auto sub_result_f = [&](cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes, uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *sub_result,
|
||||
CudaRadixCiphertextFFI *sub_overflowed,
|
||||
int_borrow_prop_memory<Torus> *overflow_sub_mem,
|
||||
CudaRadixCiphertextFFI *low) {
|
||||
sub_result->num_radix_blocks = low->num_radix_blocks;
|
||||
overflow_sub_mem->update_lut_indexes(streams, gpu_indexes, first_indexes,
|
||||
second_indexes, scalar_indexes,
|
||||
mem_ptr->rem->num_radix_blocks);
|
||||
host_integer_overflowing_sub<uint64_t>(
|
||||
streams, gpu_indexes, gpu_count, sub_result, mem_ptr->rem, low,
|
||||
sub_overflowed, (const CudaRadixCiphertextFFI *)nullptr,
|
||||
overflow_sub_mem, bsks, ksks, ms_noise_reduction_key,
|
||||
compute_overflow, uses_input_borrow);
|
||||
};
|
||||
|
||||
auto cmp_f = [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *out_boolean_block,
|
||||
CudaRadixCiphertextFFI *comparison_blocks,
|
||||
CudaRadixCiphertextFFI *d,
|
||||
int_comparison_buffer<Torus> *comparison_buffer) {
|
||||
CudaRadixCiphertextFFI *d_msb = new CudaRadixCiphertextFFI;
|
||||
uint32_t slice_start = num_blocks - block_index;
|
||||
uint32_t slice_end = d->num_radix_blocks;
|
||||
as_radix_ciphertext_slice<Torus>(d_msb, d, slice_start, slice_end);
|
||||
comparison_blocks->num_radix_blocks = d_msb->num_radix_blocks;
|
||||
if (d_msb->num_radix_blocks == 0) {
|
||||
cuda_memset_async((Torus *)out_boolean_block->ptr, 0,
|
||||
sizeof(Torus) *
|
||||
(out_boolean_block->lwe_dimension + 1),
|
||||
streams[0], gpu_indexes[0]);
|
||||
} else {
|
||||
host_compare_blocks_with_zero<Torus>(
|
||||
streams, gpu_indexes, gpu_count, comparison_blocks, d_msb,
|
||||
comparison_buffer, bsks, ksks, ms_noise_reduction_key,
|
||||
d_msb->num_radix_blocks, comparison_buffer->is_zero_lut);
|
||||
are_all_comparisons_block_true(
|
||||
streams, gpu_indexes, gpu_count, out_boolean_block,
|
||||
comparison_blocks, comparison_buffer, bsks, ksks,
|
||||
ms_noise_reduction_key, comparison_blocks->num_radix_blocks);
|
||||
|
||||
host_negation<Torus>(
|
||||
streams[0], gpu_indexes[0], (Torus *)out_boolean_block->ptr,
|
||||
(Torus *)out_boolean_block->ptr, radix_params.big_lwe_dimension, 1);
|
||||
// we calculate encoding because this block works only for
|
||||
// message_modulus = 4 and carry_modulus = 4.
|
||||
const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
|
||||
host_addition_plaintext_scalar<Torus>(
|
||||
streams[0], gpu_indexes[0], (Torus *)out_boolean_block->ptr,
|
||||
(Torus *)out_boolean_block->ptr, encoded_scalar,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
}
|
||||
delete d_msb;
|
||||
};
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
sub_result_f(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
|
||||
mem_ptr->sub_result_1, mem_ptr->sub_1_overflowed,
|
||||
mem_ptr->overflow_sub_mem_1, mem_ptr->low3);
|
||||
sub_result_f(mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
|
||||
mem_ptr->sub_result_2, mem_ptr->sub_2_overflowed,
|
||||
mem_ptr->overflow_sub_mem_2, mem_ptr->low2);
|
||||
sub_result_f(mem_ptr->sub_streams_3, gpu_indexes, gpu_count,
|
||||
mem_ptr->sub_result_3, mem_ptr->sub_3_overflowed,
|
||||
mem_ptr->overflow_sub_mem_3, mem_ptr->low1);
|
||||
cmp_f(mem_ptr->sub_streams_4, gpu_indexes, gpu_count, mem_ptr->cmp_1,
|
||||
mem_ptr->comparison_blocks_1, mem_ptr->d3,
|
||||
mem_ptr->comparison_buffer_1);
|
||||
cmp_f(mem_ptr->sub_streams_5, gpu_indexes, gpu_count, mem_ptr->cmp_2,
|
||||
mem_ptr->comparison_blocks_2, mem_ptr->d2,
|
||||
mem_ptr->comparison_buffer_2);
|
||||
cmp_f(mem_ptr->sub_streams_6, gpu_indexes, gpu_count, mem_ptr->cmp_3,
|
||||
mem_ptr->comparison_blocks_3, mem_ptr->d1,
|
||||
mem_ptr->comparison_buffer_3);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_5[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_6[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
auto r1 = mem_ptr->sub_result_3;
|
||||
auto r2 = mem_ptr->sub_result_2;
|
||||
auto r3 = mem_ptr->sub_result_1;
|
||||
|
||||
auto o1 = mem_ptr->sub_3_overflowed;
|
||||
auto o2 = mem_ptr->sub_2_overflowed;
|
||||
auto o3 = mem_ptr->sub_1_overflowed;
|
||||
|
||||
// used as a bitor
|
||||
host_integer_radix_bitop_kb(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
|
||||
o3, o3, mem_ptr->cmp_1, mem_ptr->bitor_mem_1,
|
||||
bsks, ksks, ms_noise_reduction_key);
|
||||
// used as a bitor
|
||||
host_integer_radix_bitop_kb(mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
|
||||
o2, o2, mem_ptr->cmp_2, mem_ptr->bitor_mem_2,
|
||||
bsks, ksks, ms_noise_reduction_key);
|
||||
// used as a bitor
|
||||
host_integer_radix_bitop_kb(mem_ptr->sub_streams_3, gpu_indexes, gpu_count,
|
||||
o1, o1, mem_ptr->cmp_3, mem_ptr->bitor_mem_3,
|
||||
bsks, ksks, ms_noise_reduction_key);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
// The cx variables tell whether the corresponding result of the subtraction
|
||||
// should be kept, and what value the quotient block should have
|
||||
//
|
||||
// for c3, c0; the block values are in [0, 1]
|
||||
// for c2, c1; the block values are in [0, 1, 2], 2 meaning true; 0,1
|
||||
// meaning false
|
||||
|
||||
// c3 = !o3
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
mem_ptr->c3, 0, 1, o3, 0, 1);
|
||||
host_negation<Torus>(streams[0], gpu_indexes[0], (Torus *)mem_ptr->c3->ptr,
|
||||
(Torus *)mem_ptr->c3->ptr,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
|
||||
host_addition_plaintext_scalar<Torus>(
|
||||
streams[0], gpu_indexes[0], (Torus *)mem_ptr->c3->ptr,
|
||||
(Torus *)mem_ptr->c3->ptr, encoded_scalar,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
|
||||
// c2 = !o2 + o3
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
mem_ptr->c2, 0, 1, o2, 0, 1);
|
||||
host_negation<Torus>(streams[0], gpu_indexes[0], (Torus *)mem_ptr->c2->ptr,
|
||||
(Torus *)mem_ptr->c2->ptr,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
host_addition_plaintext_scalar<Torus>(
|
||||
streams[0], gpu_indexes[0], (Torus *)mem_ptr->c2->ptr,
|
||||
(Torus *)mem_ptr->c2->ptr, encoded_scalar,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->c2, mem_ptr->c2,
|
||||
o3, 1, 4, 4);
|
||||
|
||||
// c1 = !o1 + o2
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
mem_ptr->c1, 0, 1, o1, 0, 1);
|
||||
host_negation<Torus>(streams[0], gpu_indexes[0], (Torus *)mem_ptr->c1->ptr,
|
||||
(Torus *)mem_ptr->c1->ptr,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
host_addition_plaintext_scalar<Torus>(
|
||||
streams[0], gpu_indexes[0], (Torus *)mem_ptr->c1->ptr,
|
||||
(Torus *)mem_ptr->c1->ptr, encoded_scalar,
|
||||
radix_params.big_lwe_dimension, 1);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->c1, mem_ptr->c1,
|
||||
o2, 1, 4, 4);
|
||||
|
||||
// c0 = o1 (direct copy)
|
||||
copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0],
|
||||
mem_ptr->c0, 0, 1, o1, 0, 1);
|
||||
|
||||
auto conditional_update = [&](cudaStream_t const *streams,
|
||||
uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count,
|
||||
CudaRadixCiphertextFFI *cx,
|
||||
CudaRadixCiphertextFFI *rx,
|
||||
int_radix_lut<Torus> *lut, Torus factor) {
|
||||
auto rx_list = to_lwe_ciphertext_list(rx);
|
||||
host_cleartext_multiplication<Torus>(streams[0], gpu_indexes[0],
|
||||
(Torus *)rx->ptr, &rx_list, factor);
|
||||
host_add_the_same_block_to_all_blocks<Torus>(streams[0], gpu_indexes[0],
|
||||
rx, rx, cx, 4, 4);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, rx, rx, bsks, ksks,
|
||||
ms_noise_reduction_key, lut, rx->num_radix_blocks);
|
||||
};
|
||||
|
||||
auto calculate_quotient_bits =
|
||||
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
uint32_t gpu_count, CudaRadixCiphertextFFI *q,
|
||||
CudaRadixCiphertextFFI *c, int_radix_lut<Torus> *lut) {
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
streams, gpu_indexes, gpu_count, q, c, bsks, ksks,
|
||||
ms_noise_reduction_key, lut, 1);
|
||||
};
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
conditional_update(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
|
||||
mem_ptr->c3, r3, mem_ptr->zero_out_if_not_1_lut_1, 2);
|
||||
conditional_update(mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
|
||||
mem_ptr->c2, r2, mem_ptr->zero_out_if_not_2_lut_1, 3);
|
||||
conditional_update(mem_ptr->sub_streams_3, gpu_indexes, gpu_count,
|
||||
mem_ptr->c1, r1, mem_ptr->zero_out_if_not_2_lut_2, 3);
|
||||
conditional_update(mem_ptr->sub_streams_4, gpu_indexes, gpu_count,
|
||||
mem_ptr->c0, mem_ptr->rem,
|
||||
mem_ptr->zero_out_if_not_1_lut_2, 2);
|
||||
|
||||
calculate_quotient_bits(mem_ptr->sub_streams_5, gpu_indexes, 1, mem_ptr->q1,
|
||||
mem_ptr->c1, mem_ptr->quotient_lut_1);
|
||||
calculate_quotient_bits(mem_ptr->sub_streams_6, gpu_indexes, 1, mem_ptr->q2,
|
||||
mem_ptr->c2, mem_ptr->quotient_lut_2);
|
||||
calculate_quotient_bits(mem_ptr->sub_streams_7, gpu_indexes, 1, mem_ptr->q3,
|
||||
mem_ptr->c3, mem_ptr->quotient_lut_3);
|
||||
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_4[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_5[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_6[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_7[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->rem, mem_ptr->rem,
|
||||
r3, mem_ptr->rem->num_radix_blocks, 4, 4);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->rem, mem_ptr->rem,
|
||||
r2, mem_ptr->rem->num_radix_blocks, 4, 4);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->rem, mem_ptr->rem,
|
||||
r1, mem_ptr->rem->num_radix_blocks, 4, 4);
|
||||
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->q1, mem_ptr->q1,
|
||||
mem_ptr->q2, 1, 4, 4);
|
||||
host_addition<Torus>(streams[0], gpu_indexes[0], mem_ptr->q1, mem_ptr->q1,
|
||||
mem_ptr->q3, 1, 4, 4);
|
||||
|
||||
for (uint j = 0; j < gpu_count; j++) {
|
||||
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
|
||||
}
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, mem_ptr->rem,
|
||||
mem_ptr->rem, bsks, ksks, ms_noise_reduction_key,
|
||||
mem_ptr->message_extract_lut_1, mem_ptr->rem->num_radix_blocks);
|
||||
integer_radix_apply_univariate_lookup_table_kb<Torus>(
|
||||
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, mem_ptr->q1,
|
||||
mem_ptr->q1, bsks, ksks, ms_noise_reduction_key,
|
||||
mem_ptr->message_extract_lut_2, 1);
|
||||
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
|
||||
cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
|
||||
}
|
||||
|
||||
size_t tmp_rem_size = mem_ptr->rem->num_radix_blocks;
|
||||
mem_ptr->rem->num_radix_blocks = remainder->num_radix_blocks;
|
||||
copy_radix_ciphertext_slice_async<Torus>(
|
||||
streams[0], gpu_indexes[0], remainder, block_index,
|
||||
remainder->num_radix_blocks, mem_ptr->rem, 0, tmp_rem_size);
|
||||
mem_ptr->rem->num_radix_blocks = tmp_rem_size;
|
||||
|
||||
insert_block_in_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
|
||||
mem_ptr->q1, quotient, 0);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
__host__ void host_unsigned_integer_div_rem_kb(
|
||||
cudaStream_t const *streams, uint32_t const *gpu_indexes,
|
||||
@@ -50,6 +401,14 @@ __host__ void host_unsigned_integer_div_rem_kb(
|
||||
remainder->lwe_dimension != divisor->lwe_dimension ||
|
||||
remainder->lwe_dimension != quotient->lwe_dimension)
|
||||
PANIC("Cuda error: input and output lwe dimension must be equal")
|
||||
|
||||
if (mem_ptr->params.message_modulus == 4 &&
|
||||
mem_ptr->params.carry_modulus == 4) {
|
||||
host_unsigned_integer_div_rem_kb_block_by_block_2_2<Torus>(
|
||||
streams, gpu_indexes, gpu_count, quotient, remainder, numerator,
|
||||
divisor, bsks, ksks, ms_noise_reduction_key, mem_ptr->div_rem_2_2_mem);
|
||||
return;
|
||||
}
|
||||
auto radix_params = mem_ptr->params;
|
||||
auto num_blocks = quotient->num_radix_blocks;
|
||||
|
||||
|
||||
@@ -7,6 +7,12 @@
|
||||
#include "utils/helper_profile.cuh"
|
||||
#include "utils/kernel_dimensions.cuh"
|
||||
|
||||
inline CudaLweCiphertextListFFI
|
||||
to_lwe_ciphertext_list(CudaRadixCiphertextFFI *radix) {
|
||||
return {.ptr = radix->ptr,
|
||||
.num_radix_blocks = radix->num_radix_blocks,
|
||||
.lwe_dimension = radix->lwe_dimension};
|
||||
}
|
||||
template <typename Torus>
|
||||
void create_zero_radix_ciphertext_async(cudaStream_t const stream,
|
||||
uint32_t const gpu_index,
|
||||
|
||||
@@ -105,134 +105,6 @@ const _: () = {
|
||||
ms_input_variance
|
||||
) - 32usize];
|
||||
};
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct CudaLweCiphertextListFFI {
|
||||
pub ptr: *mut ffi::c_void,
|
||||
pub num_radix_blocks: u32,
|
||||
pub lwe_dimension: u32,
|
||||
}
|
||||
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
|
||||
const _: () = {
|
||||
["Size of CudaLweCiphertextListFFI"]
|
||||
[::std::mem::size_of::<CudaLweCiphertextListFFI>() - 16usize];
|
||||
["Alignment of CudaLweCiphertextListFFI"]
|
||||
[::std::mem::align_of::<CudaLweCiphertextListFFI>() - 8usize];
|
||||
["Offset of field: CudaLweCiphertextListFFI::ptr"]
|
||||
[::std::mem::offset_of!(CudaLweCiphertextListFFI, ptr) - 0usize];
|
||||
["Offset of field: CudaLweCiphertextListFFI::num_radix_blocks"]
|
||||
[::std::mem::offset_of!(CudaLweCiphertextListFFI, num_radix_blocks) - 8usize];
|
||||
["Offset of field: CudaLweCiphertextListFFI::lwe_dimension"]
|
||||
[::std::mem::offset_of!(CudaLweCiphertextListFFI, lwe_dimension) - 12usize];
|
||||
};
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct CudaPackedGlweCiphertextListFFI {
|
||||
pub ptr: *mut ffi::c_void,
|
||||
pub storage_log_modulus: u32,
|
||||
pub lwe_per_glwe: u32,
|
||||
pub total_lwe_bodies_count: u32,
|
||||
pub glwe_dimension: u32,
|
||||
pub polynomial_size: u32,
|
||||
}
|
||||
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
|
||||
const _: () = {
|
||||
["Size of CudaPackedGlweCiphertextListFFI"]
|
||||
[::std::mem::size_of::<CudaPackedGlweCiphertextListFFI>() - 32usize];
|
||||
["Alignment of CudaPackedGlweCiphertextListFFI"]
|
||||
[::std::mem::align_of::<CudaPackedGlweCiphertextListFFI>() - 8usize];
|
||||
["Offset of field: CudaPackedGlweCiphertextListFFI::ptr"]
|
||||
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, ptr) - 0usize];
|
||||
["Offset of field: CudaPackedGlweCiphertextListFFI::storage_log_modulus"]
|
||||
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, storage_log_modulus) - 8usize];
|
||||
["Offset of field: CudaPackedGlweCiphertextListFFI::lwe_per_glwe"]
|
||||
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, lwe_per_glwe) - 12usize];
|
||||
["Offset of field: CudaPackedGlweCiphertextListFFI::total_lwe_bodies_count"]
|
||||
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, total_lwe_bodies_count) - 16usize];
|
||||
["Offset of field: CudaPackedGlweCiphertextListFFI::glwe_dimension"]
|
||||
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, glwe_dimension) - 20usize];
|
||||
["Offset of field: CudaPackedGlweCiphertextListFFI::polynomial_size"]
|
||||
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
|
||||
};
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr: *mut *mut i8,
|
||||
compression_glwe_dimension: u32,
|
||||
compression_polynomial_size: u32,
|
||||
lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
num_radix_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
lwe_per_glwe: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_decompress_radix_ciphertext_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr: *mut *mut i8,
|
||||
encryption_glwe_dimension: u32,
|
||||
encryption_polynomial_size: u32,
|
||||
compression_glwe_dimension: u32,
|
||||
compression_polynomial_size: u32,
|
||||
lwe_dimension: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
num_blocks_to_decompress: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_compress_radix_ciphertext_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
glwe_array_out: *mut CudaPackedGlweCiphertextListFFI,
|
||||
lwe_array_in: *const CudaLweCiphertextListFFI,
|
||||
fp_ksk: *const *mut ffi::c_void,
|
||||
mem_ptr: *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_decompress_radix_ciphertext_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
lwe_array_out: *mut CudaLweCiphertextListFFI,
|
||||
glwe_in: *const CudaPackedGlweCiphertextListFFI,
|
||||
indexes_array: *const u32,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
mem_ptr: *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_compress_radix_ciphertext_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
pub const SHIFT_OR_ROTATE_TYPE_LEFT_SHIFT: SHIFT_OR_ROTATE_TYPE = 0;
|
||||
pub const SHIFT_OR_ROTATE_TYPE_RIGHT_SHIFT: SHIFT_OR_ROTATE_TYPE = 1;
|
||||
pub const SHIFT_OR_ROTATE_TYPE_LEFT_ROTATE: SHIFT_OR_ROTATE_TYPE = 2;
|
||||
@@ -367,6 +239,55 @@ const _: () = {
|
||||
divisor_has_more_bits_than_numerator
|
||||
) - 60usize];
|
||||
};
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct CudaLweCiphertextListFFI {
|
||||
pub ptr: *mut ffi::c_void,
|
||||
pub num_radix_blocks: u32,
|
||||
pub lwe_dimension: u32,
|
||||
}
|
||||
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
|
||||
const _: () = {
|
||||
["Size of CudaLweCiphertextListFFI"]
|
||||
[::std::mem::size_of::<CudaLweCiphertextListFFI>() - 16usize];
|
||||
["Alignment of CudaLweCiphertextListFFI"]
|
||||
[::std::mem::align_of::<CudaLweCiphertextListFFI>() - 8usize];
|
||||
["Offset of field: CudaLweCiphertextListFFI::ptr"]
|
||||
[::std::mem::offset_of!(CudaLweCiphertextListFFI, ptr) - 0usize];
|
||||
["Offset of field: CudaLweCiphertextListFFI::num_radix_blocks"]
|
||||
[::std::mem::offset_of!(CudaLweCiphertextListFFI, num_radix_blocks) - 8usize];
|
||||
["Offset of field: CudaLweCiphertextListFFI::lwe_dimension"]
|
||||
[::std::mem::offset_of!(CudaLweCiphertextListFFI, lwe_dimension) - 12usize];
|
||||
};
|
||||
#[repr(C)]
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
pub struct CudaPackedGlweCiphertextListFFI {
|
||||
pub ptr: *mut ffi::c_void,
|
||||
pub storage_log_modulus: u32,
|
||||
pub lwe_per_glwe: u32,
|
||||
pub total_lwe_bodies_count: u32,
|
||||
pub glwe_dimension: u32,
|
||||
pub polynomial_size: u32,
|
||||
}
|
||||
#[allow(clippy::unnecessary_operation, clippy::identity_op)]
|
||||
const _: () = {
|
||||
["Size of CudaPackedGlweCiphertextListFFI"]
|
||||
[::std::mem::size_of::<CudaPackedGlweCiphertextListFFI>() - 32usize];
|
||||
["Alignment of CudaPackedGlweCiphertextListFFI"]
|
||||
[::std::mem::align_of::<CudaPackedGlweCiphertextListFFI>() - 8usize];
|
||||
["Offset of field: CudaPackedGlweCiphertextListFFI::ptr"]
|
||||
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, ptr) - 0usize];
|
||||
["Offset of field: CudaPackedGlweCiphertextListFFI::storage_log_modulus"]
|
||||
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, storage_log_modulus) - 8usize];
|
||||
["Offset of field: CudaPackedGlweCiphertextListFFI::lwe_per_glwe"]
|
||||
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, lwe_per_glwe) - 12usize];
|
||||
["Offset of field: CudaPackedGlweCiphertextListFFI::total_lwe_bodies_count"]
|
||||
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, total_lwe_bodies_count) - 16usize];
|
||||
["Offset of field: CudaPackedGlweCiphertextListFFI::glwe_dimension"]
|
||||
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, glwe_dimension) - 20usize];
|
||||
["Offset of field: CudaPackedGlweCiphertextListFFI::polynomial_size"]
|
||||
[::std::mem::offset_of!(CudaPackedGlweCiphertextListFFI, polynomial_size) - 24usize];
|
||||
};
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_apply_univariate_lut_kb_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
@@ -1934,6 +1855,85 @@ unsafe extern "C" {
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_compress_radix_ciphertext_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr: *mut *mut i8,
|
||||
compression_glwe_dimension: u32,
|
||||
compression_polynomial_size: u32,
|
||||
lwe_dimension: u32,
|
||||
ks_level: u32,
|
||||
ks_base_log: u32,
|
||||
num_radix_blocks: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
lwe_per_glwe: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_integer_decompress_radix_ciphertext_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr: *mut *mut i8,
|
||||
encryption_glwe_dimension: u32,
|
||||
encryption_polynomial_size: u32,
|
||||
compression_glwe_dimension: u32,
|
||||
compression_polynomial_size: u32,
|
||||
lwe_dimension: u32,
|
||||
pbs_level: u32,
|
||||
pbs_base_log: u32,
|
||||
num_blocks_to_decompress: u32,
|
||||
message_modulus: u32,
|
||||
carry_modulus: u32,
|
||||
pbs_type: PBS_TYPE,
|
||||
allocate_gpu_memory: bool,
|
||||
allocate_ms_array: bool,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_compress_radix_ciphertext_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
glwe_array_out: *mut CudaPackedGlweCiphertextListFFI,
|
||||
lwe_array_in: *const CudaLweCiphertextListFFI,
|
||||
fp_ksk: *const *mut ffi::c_void,
|
||||
mem_ptr: *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_integer_decompress_radix_ciphertext_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
lwe_array_out: *mut CudaLweCiphertextListFFI,
|
||||
glwe_in: *const CudaPackedGlweCiphertextListFFI,
|
||||
indexes_array: *const u32,
|
||||
bsks: *const *mut ffi::c_void,
|
||||
mem_ptr: *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_compress_radix_ciphertext_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64(
|
||||
streams: *const *mut ffi::c_void,
|
||||
gpu_indexes: *const u32,
|
||||
gpu_count: u32,
|
||||
mem_ptr_void: *mut *mut i8,
|
||||
);
|
||||
}
|
||||
pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
|
||||
pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
|
||||
pub type KS_TYPE = ffi::c_uint;
|
||||
|
||||
Reference in New Issue
Block a user