feat(gpu): ks32 in integer ops

Andrei Stoian
2025-12-28 15:02:49 +01:00
parent 0ee4342d9a
commit 2cae5003dc
73 changed files with 1703 additions and 1512 deletions

View File

@@ -11,10 +11,10 @@
* - FLUSH: to clear carry bits and isolate the message bit (x -> x & 1).
* - CARRY: to extract the carry bit for additions (x -> (x >> 1) & 1).
*/
template <typename Torus> struct int_aes_lut_buffers {
int_radix_lut<Torus> *and_lut;
int_radix_lut<Torus> *flush_lut;
int_radix_lut<Torus> *carry_lut;
template <typename Torus, typename KSTorus> struct int_aes_lut_buffers {
int_radix_lut<Torus, KSTorus> *and_lut;
int_radix_lut<Torus, KSTorus> *flush_lut;
int_radix_lut<Torus, KSTorus> *carry_lut;
int_aes_lut_buffers(CudaStreams streams, const int_radix_params &params,
bool allocate_gpu_memory, uint32_t num_aes_inputs,
@@ -23,7 +23,7 @@ template <typename Torus> struct int_aes_lut_buffers {
constexpr uint32_t AES_STATE_BITS = 128;
constexpr uint32_t SBOX_MAX_AND_GATES = 18;
this->and_lut = new int_radix_lut<Torus>(
this->and_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1,
SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
allocate_gpu_memory, size_tracker);
@@ -39,7 +39,7 @@ template <typename Torus> struct int_aes_lut_buffers {
this->and_lut->broadcast_lut(active_streams_and_lut);
this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
this->flush_lut = new int_radix_lut<Torus>(
this->flush_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, AES_STATE_BITS * num_aes_inputs,
allocate_gpu_memory, size_tracker);
std::function<Torus(Torus)> flush_lambda = [](Torus x) -> Torus {
@@ -55,7 +55,7 @@ template <typename Torus> struct int_aes_lut_buffers {
this->flush_lut->broadcast_lut(active_streams_flush_lut);
this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);
this->carry_lut = new int_radix_lut<Torus>(
this->carry_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_aes_inputs, allocate_gpu_memory, size_tracker);
std::function<Torus(Torus)> carry_lambda = [](Torus x) -> Torus {
return (x >> 1) & 1;
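Taken together, the three LUTs implement a small plaintext algebra on 2-bit blocks. A minimal host-side sketch of that algebra follows; the AND lambda is not shown in this hunk, so its bivariate form below is an assumption, not the actual device accumulator.

#include <cassert>
#include <cstdint>

using Torus = uint64_t;

// Assumed bivariate AND on message bits (the role of and_lut above).
Torus aes_and(Torus lhs, Torus rhs) { return (lhs & rhs) & 1; }
// FLUSH: drop the carry, keep the message bit.
Torus aes_flush(Torus x) { return x & 1; }
// CARRY: extract the carry bit produced by an addition.
Torus aes_carry(Torus x) { return (x >> 1) & 1; }

int main() {
  Torus sum = 1 + 1;           // two message bits added: 0b10
  assert(aes_flush(sum) == 0); // message bit is 0
  assert(aes_carry(sum) == 1); // carry bit is 1
  assert(aes_and(1, 1) == 1);
}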
@@ -92,7 +92,7 @@ template <typename Torus> struct int_aes_lut_buffers {
* temporary values like copies of columns or the results of multiplications,
* avoiding overwriting data that is still needed in the same round.
*/
template <typename Torus> struct int_aes_round_workspaces {
template <typename Torus, typename KSTorus> struct int_aes_round_workspaces {
CudaRadixCiphertextFFI *mix_columns_col_copy_buffer;
CudaRadixCiphertextFFI *mix_columns_mul_workspace_buffer;
CudaRadixCiphertextFFI *vec_tmp_bit_buffer;
@@ -154,7 +154,7 @@ template <typename Torus> struct int_aes_round_workspaces {
addition, such as the buffer for propagating the carry bit
(`vec_tmp_carry_buffer`) across the addition chain.
*/
template <typename Torus> struct int_aes_counter_workspaces {
template <typename Torus, typename KSTorus> struct int_aes_counter_workspaces {
CudaRadixCiphertextFFI *vec_tmp_carry_buffer;
CudaRadixCiphertextFFI *vec_tmp_sum_buffer;
CudaRadixCiphertextFFI *vec_trivial_b_bits_buffer;
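A hedged bit-level model of the counter addition these buffers support: a ripple-carry add over bit-per-block values, with vec_tmp_carry_buffer standing in for the propagating carry. This is a plaintext sketch of the idea, not the device schedule.

#include <cstdint>
#include <vector>

// Ripple-carry addition over bits stored LSB-first, mirroring the
// FLUSH/CARRY split used by the LUTs above. Assumes a.size() == b.size().
std::vector<uint8_t> add_bits(std::vector<uint8_t> a,
                              const std::vector<uint8_t> &b) {
  uint8_t carry = 0;
  for (size_t i = 0; i < a.size(); i++) {
    uint8_t sum = a[i] + b[i] + carry; // at most 3, fits message + carry
    a[i] = sum & 1;                    // FLUSH keeps the message bit
    carry = (sum >> 1) & 1;            // CARRY extracts the carry bit
  }
  return a;
}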
@@ -230,7 +230,7 @@ template <typename Torus> struct int_aes_counter_workspaces {
* - Other buffers are used for data layout transformations (transposition) and
* for batching small operations into larger, more efficient launches.
*/
template <typename Torus> struct int_aes_main_workspaces {
template <typename Torus, typename KSTorus> struct int_aes_main_workspaces {
CudaRadixCiphertextFFI *sbox_internal_workspace;
CudaRadixCiphertextFFI *initial_states_and_jit_key_workspace;
CudaRadixCiphertextFFI *main_bitsliced_states_buffer;
@@ -318,16 +318,16 @@ template <typename Torus> struct int_aes_main_workspaces {
* single object to manage the entire lifecycle of memory needed for a complete
* AES-CTR encryption operation.
*/
template <typename Torus> struct int_aes_encrypt_buffer {
template <typename Torus, typename KSTorus> struct int_aes_encrypt_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_aes_inputs;
uint32_t sbox_parallel_instances;
int_aes_lut_buffers<Torus> *luts;
int_aes_round_workspaces<Torus> *round_workspaces;
int_aes_counter_workspaces<Torus> *counter_workspaces;
int_aes_main_workspaces<Torus> *main_workspaces;
int_aes_lut_buffers<Torus, KSTorus> *luts;
int_aes_round_workspaces<Torus, KSTorus> *round_workspaces;
int_aes_counter_workspaces<Torus, KSTorus> *counter_workspaces;
int_aes_main_workspaces<Torus, KSTorus> *main_workspaces;
int_aes_encrypt_buffer(CudaStreams streams, const int_radix_params &params,
bool allocate_gpu_memory, uint32_t num_aes_inputs,
@@ -341,17 +341,17 @@ template <typename Torus> struct int_aes_encrypt_buffer {
this->num_aes_inputs = num_aes_inputs;
this->sbox_parallel_instances = sbox_parallelism;
this->luts = new int_aes_lut_buffers<Torus>(
this->luts = new int_aes_lut_buffers<Torus, KSTorus>(
streams, params, allocate_gpu_memory, num_aes_inputs, sbox_parallelism,
size_tracker);
this->round_workspaces = new int_aes_round_workspaces<Torus>(
this->round_workspaces = new int_aes_round_workspaces<Torus, KSTorus>(
streams, params, allocate_gpu_memory, num_aes_inputs, size_tracker);
this->counter_workspaces = new int_aes_counter_workspaces<Torus>(
this->counter_workspaces = new int_aes_counter_workspaces<Torus, KSTorus>(
streams, params, allocate_gpu_memory, num_aes_inputs, size_tracker);
this->main_workspaces = new int_aes_main_workspaces<Torus>(
this->main_workspaces = new int_aes_main_workspaces<Torus, KSTorus>(
streams, params, allocate_gpu_memory, num_aes_inputs, sbox_parallelism,
size_tracker);
}
@@ -384,7 +384,7 @@ template <typename Torus> struct int_aes_encrypt_buffer {
* This separation ensures that memory for key expansion can be allocated and
* freed independently of the main encryption process.
*/
template <typename Torus> struct int_key_expansion_buffer {
template <typename Torus, typename KSTorus> struct int_key_expansion_buffer {
int_radix_params params;
bool allocate_gpu_memory;
@@ -393,7 +393,7 @@ template <typename Torus> struct int_key_expansion_buffer {
CudaRadixCiphertextFFI *tmp_word_buffer;
CudaRadixCiphertextFFI *tmp_rotated_word_buffer;
int_aes_encrypt_buffer<Torus> *aes_encrypt_buffer;
int_aes_encrypt_buffer<Torus, KSTorus> *aes_encrypt_buffer;
int_key_expansion_buffer(CudaStreams streams, const int_radix_params &params,
bool allocate_gpu_memory, uint64_t &size_tracker) {
@@ -421,7 +421,7 @@ template <typename Torus> struct int_key_expansion_buffer {
BITS_PER_WORD, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
this->aes_encrypt_buffer = new int_aes_encrypt_buffer<Torus>(
this->aes_encrypt_buffer = new int_aes_encrypt_buffer<Torus, KSTorus>(
streams, params, allocate_gpu_memory, 1, 4, size_tracker);
}
@@ -445,7 +445,8 @@ template <typename Torus> struct int_key_expansion_buffer {
}
};
template <typename Torus> struct int_key_expansion_256_buffer {
template <typename Torus, typename KSTorus>
struct int_key_expansion_256_buffer {
int_radix_params params;
bool allocate_gpu_memory;
@@ -454,7 +455,7 @@ template <typename Torus> struct int_key_expansion_256_buffer {
CudaRadixCiphertextFFI *tmp_word_buffer;
CudaRadixCiphertextFFI *tmp_rotated_word_buffer;
int_aes_encrypt_buffer<Torus> *aes_encrypt_buffer;
int_aes_encrypt_buffer<Torus, KSTorus> *aes_encrypt_buffer;
int_key_expansion_256_buffer(CudaStreams streams,
const int_radix_params &params,
@@ -484,7 +485,7 @@ template <typename Torus> struct int_key_expansion_256_buffer {
BITS_PER_WORD, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
this->aes_encrypt_buffer = new int_aes_encrypt_buffer<Torus>(
this->aes_encrypt_buffer = new int_aes_encrypt_buffer<Torus, KSTorus>(
streams, params, allocate_gpu_memory, 1, 4, size_tracker);
}

View File

@@ -3,12 +3,13 @@
#include "integer_utilities.h"
#include "scalar_shifts.h"
template <typename Torus> struct int_abs_buffer {
template <typename Torus, typename KSTorus> struct int_abs_buffer {
int_radix_params params;
int_arithmetic_scalar_shift_buffer<Torus> *arithmetic_scalar_shift_mem;
int_sc_prop_memory<Torus> *scp_mem;
int_bitop_buffer<Torus> *bitxor_mem;
int_arithmetic_scalar_shift_buffer<Torus, KSTorus>
*arithmetic_scalar_shift_mem;
int_sc_prop_memory<Torus, KSTorus> *scp_mem;
int_bitop_buffer<Torus, KSTorus> *bitxor_mem;
CudaRadixCiphertextFFI *mask;
bool allocate_gpu_memory;
@@ -18,16 +19,17 @@ template <typename Torus> struct int_abs_buffer {
uint64_t &size_tracker) {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
arithmetic_scalar_shift_mem = new int_arithmetic_scalar_shift_buffer<Torus>(
streams, SHIFT_OR_ROTATE_TYPE::RIGHT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
arithmetic_scalar_shift_mem =
new int_arithmetic_scalar_shift_buffer<Torus, KSTorus>(
streams, SHIFT_OR_ROTATE_TYPE::RIGHT_SHIFT, params,
num_radix_blocks, allocate_gpu_memory, size_tracker);
uint32_t requested_flag = outputFlag::FLAG_NONE;
scp_mem = new int_sc_prop_memory<Torus>(streams, params, num_radix_blocks,
requested_flag, allocate_gpu_memory,
size_tracker);
bitxor_mem = new int_bitop_buffer<Torus>(streams, BITOP_TYPE::BITXOR,
params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
scp_mem = new int_sc_prop_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, requested_flag, allocate_gpu_memory,
size_tracker);
bitxor_mem = new int_bitop_buffer<Torus, KSTorus>(
streams, BITOP_TYPE::BITXOR, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
mask = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(

View File

@@ -1,11 +1,11 @@
#pragma once
#include "integer_utilities.h"
template <typename Torus> struct boolean_bitop_buffer {
template <typename Torus, typename KSTorus> struct boolean_bitop_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
int_radix_lut<Torus> *message_extract_lut;
int_radix_lut<Torus, KSTorus> *lut;
int_radix_lut<Torus, KSTorus> *message_extract_lut;
CudaRadixCiphertextFFI *tmp_lwe_left;
CudaRadixCiphertextFFI *tmp_lwe_right;
@@ -26,8 +26,9 @@ template <typename Torus> struct boolean_bitop_buffer {
case BITAND:
case BITOR:
case BITXOR:
lut = new int_radix_lut<Torus>(streams, params, 1, lwe_ciphertext_count,
allocate_gpu_memory, size_tracker);
lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, lwe_ciphertext_count, allocate_gpu_memory,
size_tracker);
{
auto lut_bivariate_f = [op](Torus lhs, Torus rhs) -> Torus {
if (op == BITOP_TYPE::BITAND) {
@@ -57,9 +58,9 @@ template <typename Torus> struct boolean_bitop_buffer {
}
if (!unchecked) {
message_extract_lut =
new int_radix_lut<Torus>(streams, params, 1, lwe_ciphertext_count,
gpu_memory_allocated, size_tracker);
message_extract_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, lwe_ciphertext_count, gpu_memory_allocated,
size_tracker);
auto lut_f_message_extract = [params](Torus x) -> Torus {
return x % params.message_modulus;
};
@@ -106,10 +107,10 @@ template <typename Torus> struct boolean_bitop_buffer {
}
};
template <typename Torus> struct int_bitop_buffer {
template <typename Torus, typename KSTorus> struct int_bitop_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
int_radix_lut<Torus, KSTorus> *lut;
BITOP_TYPE op;
bool gpu_memory_allocated;
@@ -124,8 +125,9 @@ template <typename Torus> struct int_bitop_buffer {
case BITAND:
case BITOR:
case BITXOR:
lut = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_radix_blocks, allocate_gpu_memory,
size_tracker);
{
auto lut_bivariate_f = [op](Torus lhs, Torus rhs) -> Torus {
if (op == BITOP_TYPE::BITAND) {
@@ -150,9 +152,9 @@ template <typename Torus> struct int_bitop_buffer {
break;
default:
// Scalar OP
lut = new int_radix_lut<Torus>(streams, params, params.message_modulus,
num_radix_blocks, allocate_gpu_memory,
size_tracker);
lut = new int_radix_lut<Torus, KSTorus>(
streams, params, params.message_modulus, num_radix_blocks,
allocate_gpu_memory, size_tracker);
for (int i = 0; i < params.message_modulus; i++) {
auto rhs = i;
@@ -187,9 +189,9 @@ template <typename Torus> struct int_bitop_buffer {
}
};
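The bivariate LUTs built here (BITAND/BITOR/BITXOR) act on two blocks packed into one input. A hedged sketch of the evaluation model, assuming the usual lhs * message_modulus + rhs packing; the packing convention is an assumption, it is not shown in this diff.

#include <cassert>
#include <cstdint>

using Torus = uint64_t;

// Assumed packing: the left block is scaled by message_modulus and the
// right block added before the PBS; the LUT then tabulates f(lhs, rhs).
Torus eval_bivariate(Torus lhs, Torus rhs, Torus message_modulus,
                     Torus (*f)(Torus, Torus)) {
  Torus packed = lhs * message_modulus + rhs;
  return f(packed / message_modulus, packed % message_modulus);
}

int main() {
  auto bitand_f = [](Torus l, Torus r) -> Torus { return l & r; };
  assert(eval_bivariate(3, 2, 4, bitand_f) == 2); // 0b11 & 0b10
}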
template <typename Torus> struct boolean_bitnot_buffer {
template <typename Torus, typename KSTorus> struct boolean_bitnot_buffer {
int_radix_params params;
int_radix_lut<Torus> *message_extract_lut;
int_radix_lut<Torus, KSTorus> *message_extract_lut;
bool gpu_memory_allocated;
bool unchecked;
boolean_bitnot_buffer(CudaStreams streams, int_radix_params params,
@@ -202,9 +204,9 @@ template <typename Torus> struct boolean_bitnot_buffer {
auto message_modulus = params.message_modulus;
if (!unchecked) {
message_extract_lut =
new int_radix_lut<Torus>(streams, params, 1, lwe_ciphertext_count,
gpu_memory_allocated, size_tracker);
message_extract_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, lwe_ciphertext_count, gpu_memory_allocated,
size_tracker);
auto lut_f_message_extract = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};

View File

@@ -1,12 +1,13 @@
#pragma once
#include "integer_utilities.h"
template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
template <typename Torus, typename KSTorus>
struct int_extend_radix_with_sign_msb_buffer {
int_radix_params params;
bool allocate_gpu_memory;
int_radix_lut<Torus> *lut = nullptr;
int_radix_lut<Torus, KSTorus> *lut = nullptr;
CudaRadixCiphertextFFI *last_block = nullptr;
CudaRadixCiphertextFFI *padding_block = nullptr;
@@ -22,8 +23,9 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
this->allocate_gpu_memory = allocate_gpu_memory;
if (num_additional_blocks != 0) {
this->lut = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
this->lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_radix_blocks, allocate_gpu_memory,
size_tracker);
uint32_t bits_per_block = std::log2(params.message_modulus);
uint32_t msg_modulus = params.message_modulus;
@@ -76,15 +78,15 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
}
};
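What the extend-with-sign-MSB buffer computes can be modeled in the clear: derive a padding block from the sign bit of the last block and replicate it. A plaintext sketch under an assumed convention (all-ones blocks for negative values, zero blocks otherwise); only the LUT allocation is visible in this hunk.

#include <cmath>
#include <cstdint>
#include <vector>

using Torus = uint64_t;

// Assumed convention: negative values are padded with (msg_mod - 1)
// blocks, non-negative values with zero blocks.
std::vector<Torus> extend_with_sign(std::vector<Torus> blocks, Torus msg_mod,
                                    uint32_t num_additional_blocks) {
  uint32_t bits_per_block = (uint32_t)std::log2((double)msg_mod);
  Torus sign = (blocks.back() >> (bits_per_block - 1)) & 1;
  blocks.insert(blocks.end(), num_additional_blocks,
                sign ? msg_mod - 1 : Torus{0});
  return blocks;
}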
template <typename Torus> struct int_cast_to_unsigned_buffer {
template <typename Torus, typename KSTorus> struct int_cast_to_unsigned_buffer {
int_radix_params params;
bool allocate_gpu_memory;
bool requires_full_propagate;
bool requires_sign_extension;
int_fullprop_buffer<Torus> *prop_buffer;
int_extend_radix_with_sign_msb_buffer<Torus> *extend_buffer;
int_fullprop_buffer<Torus, KSTorus> *prop_buffer;
int_extend_radix_with_sign_msb_buffer<Torus, KSTorus> *extend_buffer;
int_cast_to_unsigned_buffer(CudaStreams streams, int_radix_params params,
uint32_t num_input_blocks,
@@ -100,7 +102,7 @@ template <typename Torus> struct int_cast_to_unsigned_buffer {
this->extend_buffer = nullptr;
if (requires_full_propagate) {
this->prop_buffer = new int_fullprop_buffer<Torus>(
this->prop_buffer = new int_fullprop_buffer<Torus, KSTorus>(
streams, params, allocate_gpu_memory, size_tracker);
}
@@ -109,9 +111,10 @@ template <typename Torus> struct int_cast_to_unsigned_buffer {
if (this->requires_sign_extension) {
uint32_t num_blocks_to_add = target_num_blocks - num_input_blocks;
this->extend_buffer = new int_extend_radix_with_sign_msb_buffer<Torus>(
streams, params, num_input_blocks, num_blocks_to_add,
allocate_gpu_memory, size_tracker);
this->extend_buffer =
new int_extend_radix_with_sign_msb_buffer<Torus, KSTorus>(
streams, params, num_input_blocks, num_blocks_to_add,
allocate_gpu_memory, size_tracker);
}
}
@@ -128,13 +131,13 @@ template <typename Torus> struct int_cast_to_unsigned_buffer {
}
};
template <typename Torus> struct int_cast_to_signed_buffer {
template <typename Torus, typename KSTorus> struct int_cast_to_signed_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_input_blocks;
uint32_t target_num_blocks;
int_extend_radix_with_sign_msb_buffer<Torus> *extend_buffer;
int_extend_radix_with_sign_msb_buffer<Torus, KSTorus> *extend_buffer;
int_cast_to_signed_buffer(CudaStreams streams, int_radix_params params,
uint32_t num_input_blocks,
@@ -148,9 +151,10 @@ template <typename Torus> struct int_cast_to_signed_buffer {
if (input_is_signed && target_num_blocks > num_input_blocks) {
uint32_t num_additional_blocks = target_num_blocks - num_input_blocks;
this->extend_buffer = new int_extend_radix_with_sign_msb_buffer<Torus>(
streams, params, num_input_blocks, num_additional_blocks,
allocate_gpu_memory, size_tracker);
this->extend_buffer =
new int_extend_radix_with_sign_msb_buffer<Torus, KSTorus>(
streams, params, num_input_blocks, num_additional_blocks,
allocate_gpu_memory, size_tracker);
}
}

View File

@@ -1,7 +1,7 @@
#pragma once
#include "integer_utilities.h"
template <typename Torus> struct int_zero_out_if_buffer {
template <typename Torus, typename KSTorus> struct int_zero_out_if_buffer {
int_radix_params params;
@@ -29,9 +29,9 @@ template <typename Torus> struct int_zero_out_if_buffer {
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
};
template <typename Torus> struct int_cmux_buffer {
int_radix_lut<Torus> *predicate_lut;
int_radix_lut<Torus> *message_extract_lut;
template <typename Torus, typename KSTorus> struct int_cmux_buffer {
int_radix_lut<Torus, KSTorus> *predicate_lut;
int_radix_lut<Torus, KSTorus> *message_extract_lut;
CudaRadixCiphertextFFI *buffer_in;
CudaRadixCiphertextFFI *buffer_out;
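A hedged plaintext model of the cmux these members implement: each branch is kept or zeroed out according to the predicate, and the two results are summed, so no data-dependent branch is needed. A sketch of the idea, not the device kernels.

#include <cstdint>
#include <vector>

using Torus = uint64_t;

// out[i] = cond ? lhs[i] : rhs[i], expressed branch-free as in the
// predicate / zero-out structure above.
std::vector<Torus> cmux_plain(bool cond, const std::vector<Torus> &lhs,
                              const std::vector<Torus> &rhs) {
  std::vector<Torus> out(lhs.size());
  for (size_t i = 0; i < lhs.size(); i++)
    out[i] = (Torus)cond * lhs[i] + (Torus)(!cond) * rhs[i];
  return out;
}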
@@ -76,13 +76,13 @@ template <typename Torus> struct int_cmux_buffer {
return x % params.message_modulus;
};
predicate_lut =
new int_radix_lut<Torus>(streams, params, 2, 2 * num_radix_blocks,
allocate_gpu_memory, size_tracker);
predicate_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 2, 2 * num_radix_blocks, allocate_gpu_memory,
size_tracker);
message_extract_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
new int_radix_lut<Torus, KSTorus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), predicate_lut->get_lut(0, 0),

View File

@@ -2,7 +2,8 @@
#include "cmux.h"
#include "integer_utilities.h"
template <typename Torus> struct int_are_all_block_true_buffer {
template <typename Torus, typename KSTorus>
struct int_are_all_block_true_buffer {
COMPARISON_TYPE op;
int_radix_params params;
@@ -12,7 +13,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
// This map stores LUTs that check the equality between some input and values
// of interest in are_all_block_true(), such as max_value (the maximum message
// value).
int_radix_lut<Torus> *is_max_value;
int_radix_lut<Torus, KSTorus> *is_max_value;
Torus *preallocated_h_lut;
bool gpu_memory_allocated;
@@ -39,8 +40,8 @@ template <typename Torus> struct int_are_all_block_true_buffer {
max_chunks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
is_max_value = new int_radix_lut<Torus>(streams, params, 2, max_chunks,
allocate_gpu_memory, size_tracker);
is_max_value = new int_radix_lut<Torus, KSTorus>(
streams, params, 2, max_chunks, allocate_gpu_memory, size_tracker);
auto is_max_value_f = [max_value](Torus x) -> Torus {
return x == max_value;
};
@@ -70,15 +71,15 @@ template <typename Torus> struct int_are_all_block_true_buffer {
}
};
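A hypothetical plaintext model of are_all_block_true(): fold the per-block boolean results in chunks, summing each chunk and testing the sum with an is_max_value-style equality check. The exact chunking is an assumption inferred from the members above.

#include <cstdint>
#include <vector>

using Torus = uint64_t;

// Returns 1 iff every input block is 1. The sums play the role of the
// accumulated comparison blocks, the equality test the role of
// is_max_value. Assumes max_chunk >= 2.
Torus are_all_block_true_plain(std::vector<Torus> bits, size_t max_chunk) {
  while (bits.size() > 1) {
    std::vector<Torus> next;
    for (size_t i = 0; i < bits.size(); i += max_chunk) {
      Torus sum = 0, n = 0;
      for (size_t j = i; j < i + max_chunk && j < bits.size(); j++, n++)
        sum += bits[j];
      next.push_back(sum == n); // chunk is "all true" iff sum hits its size
    }
    bits.swap(next);
  }
  return bits.empty() ? 1 : bits[0];
}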
template <typename Torus> struct int_comparison_eq_buffer {
template <typename Torus, typename KSTorus> struct int_comparison_eq_buffer {
int_radix_params params;
COMPARISON_TYPE op;
int_radix_lut<Torus> *operator_lut;
int_radix_lut<Torus> *is_non_zero_lut;
int_radix_lut<Torus> *scalar_comparison_luts;
int_radix_lut<Torus, KSTorus> *operator_lut;
int_radix_lut<Torus, KSTorus> *is_non_zero_lut;
int_radix_lut<Torus, KSTorus> *scalar_comparison_luts;
int_are_all_block_true_buffer<Torus> *are_all_block_true_buffer;
int_are_all_block_true_buffer<Torus, KSTorus> *are_all_block_true_buffer;
bool gpu_memory_allocated;
int_comparison_eq_buffer(CudaStreams streams, COMPARISON_TYPE op,
@@ -89,9 +90,10 @@ template <typename Torus> struct int_comparison_eq_buffer {
this->op = op;
Torus total_modulus = params.message_modulus * params.carry_modulus;
are_all_block_true_buffer = new int_are_all_block_true_buffer<Torus>(
streams, op, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
are_all_block_true_buffer =
new int_are_all_block_true_buffer<Torus, KSTorus>(
streams, op, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
// f(x) -> x != 0
auto is_non_zero_lut_f = [total_modulus](Torus x) -> Torus {
@@ -99,8 +101,8 @@ template <typename Torus> struct int_comparison_eq_buffer {
};
is_non_zero_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
new int_radix_lut<Torus, KSTorus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), is_non_zero_lut->get_lut(0, 0),
@@ -112,7 +114,7 @@ template <typename Torus> struct int_comparison_eq_buffer {
is_non_zero_lut->broadcast_lut(active_streams);
// Scalar may have up to num_radix_blocks blocks
scalar_comparison_luts = new int_radix_lut<Torus>(
scalar_comparison_luts = new int_radix_lut<Torus, KSTorus>(
streams, params, total_modulus, num_radix_blocks, allocate_gpu_memory,
size_tracker);
@@ -142,9 +144,9 @@ template <typename Torus> struct int_comparison_eq_buffer {
}
scalar_comparison_luts->broadcast_lut(active_streams);
if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
operator_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
operator_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_radix_blocks, allocate_gpu_memory,
size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
@@ -179,15 +181,16 @@ template <typename Torus> struct int_comparison_eq_buffer {
}
};
template <typename Torus> struct int_tree_sign_reduction_buffer {
template <typename Torus, typename KSTorus>
struct int_tree_sign_reduction_buffer {
int_radix_params params;
std::function<Torus(Torus, Torus)> block_selector_f;
int_radix_lut<Torus> *tree_inner_leaf_lut;
int_radix_lut<Torus> *tree_last_leaf_lut;
int_radix_lut<Torus, KSTorus> *tree_inner_leaf_lut;
int_radix_lut<Torus, KSTorus> *tree_last_leaf_lut;
int_radix_lut<Torus> *tree_last_leaf_scalar_lut;
int_radix_lut<Torus, KSTorus> *tree_last_leaf_scalar_lut;
Torus *preallocated_h_lut;
CudaRadixCiphertextFFI *tmp_x;
@@ -220,16 +223,16 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
// LUTs
tree_inner_leaf_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
new int_radix_lut<Torus, KSTorus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
tree_last_leaf_lut = new int_radix_lut<Torus>(
tree_last_leaf_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
preallocated_h_lut = (Torus *)malloc(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
tree_last_leaf_scalar_lut = new int_radix_lut<Torus>(
tree_last_leaf_scalar_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
@@ -261,7 +264,7 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
}
};
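The tree reduction folds per-block sign verdicts pairwise with block_selector_f. A hedged sketch of one plausible selector, assuming an encoding in which one value means "blocks were equal"; both the encoding and the constant below are assumptions.

#include <cstdint>

using Torus = uint64_t;

constexpr Torus IS_EQUAL = 1; // hypothetical encoding of "equal"

// The higher block's verdict wins unless it says "equal", in which case
// the lower block decides -- the shape of block_selector_f above.
Torus select_sign(Torus msb_sign, Torus lsb_sign) {
  return msb_sign == IS_EQUAL ? lsb_sign : msb_sign;
}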
template <typename Torus> struct int_comparison_diff_buffer {
template <typename Torus, typename KSTorus> struct int_comparison_diff_buffer {
int_radix_params params;
COMPARISON_TYPE op;
@@ -269,11 +272,11 @@ template <typename Torus> struct int_comparison_diff_buffer {
std::function<Torus(Torus)> operator_f;
int_tree_sign_reduction_buffer<Torus> *tree_buffer;
int_tree_sign_reduction_buffer<Torus, KSTorus> *tree_buffer;
CudaRadixCiphertextFFI *tmp_signs_a;
CudaRadixCiphertextFFI *tmp_signs_b;
int_radix_lut<Torus> *reduce_signs_lut;
int_radix_lut<Torus, KSTorus> *reduce_signs_lut;
bool gpu_memory_allocated;
Torus *preallocated_h_lut1;
Torus *preallocated_h_lut2;
@@ -304,7 +307,7 @@ template <typename Torus> struct int_comparison_diff_buffer {
streams.stream(0), streams.gpu_index(0), tmp_packed, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
tree_buffer = new int_tree_sign_reduction_buffer<Torus>(
tree_buffer = new int_tree_sign_reduction_buffer<Torus, KSTorus>(
streams, operator_f, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
tmp_signs_a = new CudaRadixCiphertextFFI;
@@ -317,8 +320,8 @@ template <typename Torus> struct int_comparison_diff_buffer {
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
// LUTs
reduce_signs_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
new int_radix_lut<Torus, KSTorus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
preallocated_h_lut1 = (Torus *)malloc(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus));
preallocated_h_lut2 = (Torus *)malloc(
@@ -346,19 +349,19 @@ template <typename Torus> struct int_comparison_diff_buffer {
}
};
template <typename Torus> struct int_comparison_buffer {
template <typename Torus, typename KSTorus> struct int_comparison_buffer {
COMPARISON_TYPE op;
int_radix_params params;
//////////////////
int_radix_lut<Torus> *identity_lut;
int_radix_lut<Torus, KSTorus> *identity_lut;
std::function<Torus(Torus)> identity_lut_f;
int_radix_lut<Torus> *is_zero_lut;
int_radix_lut<Torus, KSTorus> *is_zero_lut;
int_comparison_eq_buffer<Torus> *eq_buffer;
int_comparison_diff_buffer<Torus> *diff_buffer;
int_comparison_eq_buffer<Torus, KSTorus> *eq_buffer;
int_comparison_diff_buffer<Torus, KSTorus> *diff_buffer;
CudaRadixCiphertextFFI *tmp_block_comparisons;
CudaRadixCiphertextFFI *tmp_lwe_array_out;
@@ -368,14 +371,14 @@ template <typename Torus> struct int_comparison_buffer {
CudaRadixCiphertextFFI *tmp_packed_input;
// Max Min
int_cmux_buffer<Torus> *cmux_buffer;
int_cmux_buffer<Torus, KSTorus> *cmux_buffer;
// Signed LUT
int_radix_lut<Torus> *signed_lut;
int_radix_lut<Torus, KSTorus> *signed_lut;
bool is_signed;
// Used for scalar comparisons
int_radix_lut<Torus> *signed_msb_lut;
int_radix_lut<Torus, KSTorus> *signed_msb_lut;
CudaStreams lsb_streams;
CudaStreams msb_streams;
bool gpu_memory_allocated;
@@ -419,8 +422,8 @@ template <typename Torus> struct int_comparison_buffer {
// Cleaning LUT
identity_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
new int_radix_lut<Torus, KSTorus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), identity_lut->get_lut(0, 0),
@@ -434,8 +437,9 @@ template <typename Torus> struct int_comparison_buffer {
return (x % total_modulus) == 0;
};
is_zero_lut = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
is_zero_lut =
new int_radix_lut<Torus, KSTorus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), is_zero_lut->get_lut(0, 0),
@@ -448,7 +452,7 @@ template <typename Torus> struct int_comparison_buffer {
switch (op) {
case COMPARISON_TYPE::MAX:
case COMPARISON_TYPE::MIN:
cmux_buffer = new int_cmux_buffer<Torus>(
cmux_buffer = new int_cmux_buffer<Torus, KSTorus>(
streams,
[op](Torus x) -> Torus {
if (op == COMPARISON_TYPE::MAX)
@@ -461,12 +465,12 @@ template <typename Torus> struct int_comparison_buffer {
case COMPARISON_TYPE::GE:
case COMPARISON_TYPE::LT:
case COMPARISON_TYPE::LE:
diff_buffer = new int_comparison_diff_buffer<Torus>(
diff_buffer = new int_comparison_diff_buffer<Torus, KSTorus>(
streams, op, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
case COMPARISON_TYPE::EQ:
case COMPARISON_TYPE::NE:
eq_buffer = new int_comparison_eq_buffer<Torus>(
eq_buffer = new int_comparison_eq_buffer<Torus, KSTorus>(
streams, op, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
break;
@@ -481,9 +485,9 @@ template <typename Torus> struct int_comparison_buffer {
streams.stream(0), streams.gpu_index(0), tmp_trivial_sign_block, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
signed_lut = new int_radix_lut<Torus>(streams, params, 1, 1,
allocate_gpu_memory, size_tracker);
signed_msb_lut = new int_radix_lut<Torus>(
signed_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
signed_msb_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
auto message_modulus = (int)params.message_modulus;

View File

@@ -51,7 +51,7 @@ template <typename Torus> struct int_compression {
}
};
template <typename Torus> struct int_decompression {
template <typename Torus, typename KSTorus> struct int_decompression {
int_radix_params encryption_params;
int_radix_params compression_params;
uint32_t num_blocks_to_decompress;
@@ -60,7 +60,7 @@ template <typename Torus> struct int_decompression {
Torus *tmp_extracted_lwe;
uint32_t *tmp_indexes_array;
int_radix_lut<Torus> *decompression_rescale_lut;
int_radix_lut<Torus, KSTorus> *decompression_rescale_lut;
bool gpu_memory_allocated;
int_decompression(CudaStreams streams, int_radix_params encryption_params,
@@ -97,7 +97,7 @@ template <typename Torus> struct int_decompression {
// Example: in the 2_2 case we are mapping a 2-bit message onto a 4-bit
// space; we want to keep the original 2-bit value in the 4-bit space,
// so we apply the identity and the encoding rescales it for us.
decompression_rescale_lut = new int_radix_lut<Torus>(
decompression_rescale_lut = new int_radix_lut<Torus, KSTorus>(
streams, encryption_params, 1, num_blocks_to_decompress,
allocate_gpu_memory, size_tracker);
auto decompression_rescale_f = [](Torus x) -> Torus { return x; };
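Since the LUT is the identity, the actual rescale happens in the output encoding. A hedged arithmetic sketch, assuming the usual TFHE-style scaling factor delta = 2^63 / (message_modulus * carry_modulus) for 64-bit torus values; the concrete encoding is an assumption, not shown in this diff.

#include <cstdint>

using Torus = uint64_t;

// Assumed encoding: a message m is stored as m * delta.
constexpr Torus delta(Torus message_modulus, Torus carry_modulus) {
  return (Torus{1} << 63) / (message_modulus * carry_modulus);
}

int main() {
  Torus m = 2;                      // 2-bit message recovered from storage
  Torus rescaled = m * delta(4, 4); // re-encoded in the 2_2 (4-bit) space
  (void)rescaled;                   // identity LUT + encoding = rescale
}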

View File

@@ -6,33 +6,34 @@
#include "scalar_shifts.h"
// used only when 4 GPUs are available
template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
template <typename Torus, typename KSTorus>
struct unsigned_int_div_rem_2_2_memory {
int_radix_params params;
bool gpu_memory_allocated;
// memory objects for other operations
int_borrow_prop_memory<Torus> *overflow_sub_mem_1;
int_borrow_prop_memory<Torus> *overflow_sub_mem_2;
int_borrow_prop_memory<Torus> *overflow_sub_mem_3;
int_comparison_buffer<Torus> *comparison_buffer_1;
int_comparison_buffer<Torus> *comparison_buffer_2;
int_comparison_buffer<Torus> *comparison_buffer_3;
int_sub_and_propagate<Torus> *sub_and_propagate_mem;
int_bitop_buffer<Torus> *bitor_mem_1;
int_bitop_buffer<Torus> *bitor_mem_2;
int_bitop_buffer<Torus> *bitor_mem_3;
int_logical_scalar_shift_buffer<Torus> *shift_mem;
int_borrow_prop_memory<Torus, KSTorus> *overflow_sub_mem_1;
int_borrow_prop_memory<Torus, KSTorus> *overflow_sub_mem_2;
int_borrow_prop_memory<Torus, KSTorus> *overflow_sub_mem_3;
int_comparison_buffer<Torus, KSTorus> *comparison_buffer_1;
int_comparison_buffer<Torus, KSTorus> *comparison_buffer_2;
int_comparison_buffer<Torus, KSTorus> *comparison_buffer_3;
int_sub_and_propagate<Torus, KSTorus> *sub_and_propagate_mem;
int_bitop_buffer<Torus, KSTorus> *bitor_mem_1;
int_bitop_buffer<Torus, KSTorus> *bitor_mem_2;
int_bitop_buffer<Torus, KSTorus> *bitor_mem_3;
int_logical_scalar_shift_buffer<Torus, KSTorus> *shift_mem;
// lookup tables
int_radix_lut<Torus> *message_extract_lut_1;
int_radix_lut<Torus> *message_extract_lut_2;
int_radix_lut<Torus> *zero_out_if_not_1_lut_1;
int_radix_lut<Torus> *zero_out_if_not_1_lut_2;
int_radix_lut<Torus> *zero_out_if_not_2_lut_1;
int_radix_lut<Torus> *zero_out_if_not_2_lut_2;
int_radix_lut<Torus> *quotient_lut_1;
int_radix_lut<Torus> *quotient_lut_2;
int_radix_lut<Torus> *quotient_lut_3;
int_radix_lut<Torus, KSTorus> *message_extract_lut_1;
int_radix_lut<Torus, KSTorus> *message_extract_lut_2;
int_radix_lut<Torus, KSTorus> *zero_out_if_not_1_lut_1;
int_radix_lut<Torus, KSTorus> *zero_out_if_not_1_lut_2;
int_radix_lut<Torus, KSTorus> *zero_out_if_not_2_lut_1;
int_radix_lut<Torus, KSTorus> *zero_out_if_not_2_lut_2;
int_radix_lut<Torus, KSTorus> *quotient_lut_1;
int_radix_lut<Torus, KSTorus> *quotient_lut_2;
int_radix_lut<Torus, KSTorus> *quotient_lut_3;
// sub streams
CudaStreams sub_streams_1;
@@ -252,21 +253,21 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
void init_lookup_tables(CudaStreams streams, uint32_t num_blocks,
bool allocate_gpu_memory, uint64_t &size_tracker) {
zero_out_if_not_1_lut_1 =
new int_radix_lut<Torus>(streams.get_ith(0), params, 1, num_blocks,
allocate_gpu_memory, size_tracker);
zero_out_if_not_1_lut_1 = new int_radix_lut<Torus, KSTorus>(
streams.get_ith(0), params, 1, num_blocks, allocate_gpu_memory,
size_tracker);
zero_out_if_not_2_lut_1 =
new int_radix_lut<Torus>(streams.get_ith(1), params, 1, num_blocks,
allocate_gpu_memory, tmp_size_tracker);
zero_out_if_not_2_lut_1 = new int_radix_lut<Torus, KSTorus>(
streams.get_ith(1), params, 1, num_blocks, allocate_gpu_memory,
tmp_size_tracker);
zero_out_if_not_2_lut_2 =
new int_radix_lut<Torus>(streams.get_ith(2), params, 1, num_blocks,
allocate_gpu_memory, tmp_size_tracker);
zero_out_if_not_2_lut_2 = new int_radix_lut<Torus, KSTorus>(
streams.get_ith(2), params, 1, num_blocks, allocate_gpu_memory,
tmp_size_tracker);
zero_out_if_not_1_lut_2 =
new int_radix_lut<Torus>(streams.get_ith(3), params, 1, num_blocks,
allocate_gpu_memory, tmp_size_tracker);
zero_out_if_not_1_lut_2 = new int_radix_lut<Torus, KSTorus>(
streams.get_ith(3), params, 1, num_blocks, allocate_gpu_memory,
tmp_size_tracker);
auto zero_out_if_not_1_lut_f = [](Torus x) -> Torus {
Torus block = x / 2;
@@ -279,8 +280,8 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
return block * (Torus)condition;
};
int_radix_lut<Torus> *luts[2] = {zero_out_if_not_1_lut_1,
zero_out_if_not_1_lut_2};
int_radix_lut<Torus, KSTorus> *luts[2] = {zero_out_if_not_1_lut_1,
zero_out_if_not_1_lut_2};
size_t lut_gpu_indexes[2] = {0, 3};
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
@@ -304,13 +305,13 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
params.carry_modulus, zero_out_if_not_2_lut_f, gpu_memory_allocated);
}
quotient_lut_1 =
new int_radix_lut<Torus>(streams.get_ith(2), params, 1, 1,
allocate_gpu_memory, tmp_size_tracker);
quotient_lut_2 =
new int_radix_lut<Torus>(streams.get_ith(1), params, 1, 1,
allocate_gpu_memory, tmp_size_tracker);
quotient_lut_3 = new int_radix_lut<Torus>(
quotient_lut_1 = new int_radix_lut<Torus, KSTorus>(
streams.get_ith(2), params, 1, 1, allocate_gpu_memory,
tmp_size_tracker);
quotient_lut_2 = new int_radix_lut<Torus, KSTorus>(
streams.get_ith(1), params, 1, 1, allocate_gpu_memory,
tmp_size_tracker);
quotient_lut_3 = new int_radix_lut<Torus, KSTorus>(
streams.get_ith(0), params, 1, 1, allocate_gpu_memory, size_tracker);
auto quotient_lut_1_f = [](Torus cond) -> Torus {
@@ -337,9 +338,9 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, quotient_lut_3_f, gpu_memory_allocated);
message_extract_lut_1 = new int_radix_lut<Torus>(
message_extract_lut_1 = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
message_extract_lut_2 = new int_radix_lut<Torus>(
message_extract_lut_2 = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
auto message_modulus = params.message_modulus;
@@ -376,22 +377,22 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
this->params = params;
gpu_memory_allocated = allocate_gpu_memory;
sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
sub_and_propagate_mem = new int_sub_and_propagate<Torus, KSTorus>(
streams.get_ith(0), params, num_blocks + 1, outputFlag::FLAG_NONE,
allocate_gpu_memory, size_tracker);
shift_mem = new int_logical_scalar_shift_buffer<Torus>(
shift_mem = new int_logical_scalar_shift_buffer<Torus, KSTorus>(
streams.get_ith(1), SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params,
2 * num_blocks, allocate_gpu_memory, tmp_size_tracker);
uint32_t compute_overflow = 1;
overflow_sub_mem_1 = new int_borrow_prop_memory<Torus>(
overflow_sub_mem_1 = new int_borrow_prop_memory<Torus, KSTorus>(
streams.get_ith(0), params, num_blocks, compute_overflow,
allocate_gpu_memory, size_tracker);
overflow_sub_mem_2 = new int_borrow_prop_memory<Torus>(
overflow_sub_mem_2 = new int_borrow_prop_memory<Torus, KSTorus>(
streams.get_ith(1), params, num_blocks, compute_overflow,
allocate_gpu_memory, tmp_size_tracker);
overflow_sub_mem_3 = new int_borrow_prop_memory<Torus>(
overflow_sub_mem_3 = new int_borrow_prop_memory<Torus, KSTorus>(
streams.get_ith(2), params, num_blocks, compute_overflow,
allocate_gpu_memory, tmp_size_tracker);
uint32_t group_size = overflow_sub_mem_1->group_size;
@@ -419,22 +420,22 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
&second_indexes_for_overflow_sub_gpu_2, &scalars_for_overflow_sub_gpu_2,
num_blocks, allocate_gpu_memory, tmp_size_tracker);
comparison_buffer_1 = new int_comparison_buffer<Torus>(
comparison_buffer_1 = new int_comparison_buffer<Torus, KSTorus>(
streams.get_ith(0), COMPARISON_TYPE::EQ, params, num_blocks, false,
allocate_gpu_memory, size_tracker);
comparison_buffer_2 = new int_comparison_buffer<Torus>(
comparison_buffer_2 = new int_comparison_buffer<Torus, KSTorus>(
streams.get_ith(1), COMPARISON_TYPE::EQ, params, num_blocks, false,
allocate_gpu_memory, tmp_size_tracker);
comparison_buffer_3 = new int_comparison_buffer<Torus>(
comparison_buffer_3 = new int_comparison_buffer<Torus, KSTorus>(
streams.get_ith(2), COMPARISON_TYPE::EQ, params, num_blocks, false,
allocate_gpu_memory, tmp_size_tracker);
bitor_mem_1 = new int_bitop_buffer<Torus>(
bitor_mem_1 = new int_bitop_buffer<Torus, KSTorus>(
streams.get_ith(0), BITOP_TYPE::BITOR, params, num_blocks,
allocate_gpu_memory, size_tracker);
bitor_mem_2 = new int_bitop_buffer<Torus>(
bitor_mem_2 = new int_bitop_buffer<Torus, KSTorus>(
streams.get_ith(1), BITOP_TYPE::BITOR, params, num_blocks,
allocate_gpu_memory, tmp_size_tracker);
bitor_mem_3 = new int_bitop_buffer<Torus>(
bitor_mem_3 = new int_bitop_buffer<Torus, KSTorus>(
streams.get_ith(2), BITOP_TYPE::BITOR, params, num_blocks,
allocate_gpu_memory, tmp_size_tracker);
@@ -849,24 +850,24 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
}
};
template <typename Torus> struct unsigned_int_div_rem_memory {
template <typename Torus, typename KSTorus> struct unsigned_int_div_rem_memory {
int_radix_params params;
// memory objects for other operations
int_logical_scalar_shift_buffer<Torus> *shift_mem_1;
int_logical_scalar_shift_buffer<Torus> *shift_mem_2;
int_borrow_prop_memory<Torus> *overflow_sub_mem;
int_comparison_buffer<Torus> *comparison_buffer;
unsigned_int_div_rem_2_2_memory<Torus> *div_rem_2_2_mem;
int_logical_scalar_shift_buffer<Torus, KSTorus> *shift_mem_1;
int_logical_scalar_shift_buffer<Torus, KSTorus> *shift_mem_2;
int_borrow_prop_memory<Torus, KSTorus> *overflow_sub_mem;
int_comparison_buffer<Torus, KSTorus> *comparison_buffer;
unsigned_int_div_rem_2_2_memory<Torus, KSTorus> *div_rem_2_2_mem;
// lookup tables
int_radix_lut<Torus> **masking_luts_1;
int_radix_lut<Torus> **masking_luts_2;
int_radix_lut<Torus> *message_extract_lut_1;
int_radix_lut<Torus> *message_extract_lut_2;
int_radix_lut<Torus> **zero_out_if_overflow_did_not_happen;
int_radix_lut<Torus> **zero_out_if_overflow_happened;
int_radix_lut<Torus> **merge_overflow_flags_luts;
int_radix_lut<Torus, KSTorus> **masking_luts_1;
int_radix_lut<Torus, KSTorus> **masking_luts_2;
int_radix_lut<Torus, KSTorus> *message_extract_lut_1;
int_radix_lut<Torus, KSTorus> *message_extract_lut_2;
int_radix_lut<Torus, KSTorus> **zero_out_if_overflow_did_not_happen;
int_radix_lut<Torus, KSTorus> **zero_out_if_overflow_happened;
int_radix_lut<Torus, KSTorus> **merge_overflow_flags_luts;
// sub streams
CudaStreams sub_streams_1;
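The members above (shift buffers, overflow-subtract memory, comparison buffer, masking and merge-overflow LUTs) serve a shift-and-subtract division loop. A hedged plaintext outline of that classic restoring scheme; this sketches the idea, not the homomorphic schedule.

#include <cstdint>

// Restoring binary division: bring down one numerator bit per step, try
// to subtract the divisor, and set the quotient bit when the trial
// subtraction does not overflow (the role of overflow_sub_mem).
// Assumes d != 0.
void div_rem_plain(uint64_t n, uint64_t d, uint64_t &q, uint64_t &r) {
  q = 0;
  r = 0;
  for (int i = 63; i >= 0; i--) {
    r = (r << 1) | ((n >> i) & 1); // shift_mem: shift in the next bit
    if (r >= d) {                  // trial subtraction did not overflow
      r -= d;
      q |= uint64_t{1} << i;
    }
  }
}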
@@ -994,16 +995,18 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
// create and generate masking_luts_1[] and masking_luts_2[]
// both of them are equal, but because they are used in two different
// executions in parallel, we need two different pbs_buffers.
masking_luts_1 = new int_radix_lut<Torus> *[params.message_modulus - 1];
masking_luts_2 = new int_radix_lut<Torus> *[params.message_modulus - 1];
masking_luts_1 =
new int_radix_lut<Torus, KSTorus> *[params.message_modulus - 1];
masking_luts_2 =
new int_radix_lut<Torus, KSTorus> *[params.message_modulus - 1];
for (int i = 0; i < params.message_modulus - 1; i++) {
uint32_t shifted_mask = i;
std::function<Torus(Torus)> lut_f_masking =
[shifted_mask](Torus x) -> Torus { return x & shifted_mask; };
masking_luts_1[i] = new int_radix_lut<Torus>(
masking_luts_1[i] = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
masking_luts_2[i] = new int_radix_lut<Torus>(
masking_luts_2[i] = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
@@ -1028,9 +1031,9 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
// create and generate message_extract_lut_1 and message_extract_lut_2
// both of them are equal, but because they are used in two different
// executions in parallel, we need two different pbs_buffers.
message_extract_lut_1 = new int_radix_lut<Torus>(
message_extract_lut_1 = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
message_extract_lut_2 = new int_radix_lut<Torus>(
message_extract_lut_2 = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
auto message_modulus = params.message_modulus;
@@ -1038,8 +1041,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
return x % message_modulus;
};
int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
int_radix_lut<Torus, KSTorus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
auto active_streams = streams.active_gpu_subset(num_blocks);
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
@@ -1059,10 +1062,11 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
};
// create and generate zero_out_if_overflow_did_not_happen
zero_out_if_overflow_did_not_happen = new int_radix_lut<Torus> *[2];
zero_out_if_overflow_did_not_happen[0] = new int_radix_lut<Torus>(
zero_out_if_overflow_did_not_happen =
new int_radix_lut<Torus, KSTorus> *[2];
zero_out_if_overflow_did_not_happen[0] = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
zero_out_if_overflow_did_not_happen[1] = new int_radix_lut<Torus>(
zero_out_if_overflow_did_not_happen[1] = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
auto cur_lut_f = [&](Torus block, Torus overflow_sum) -> Torus {
@@ -1093,10 +1097,10 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
zero_out_if_overflow_did_not_happen[1]->broadcast_lut(active_streams);
// create and generate zero_out_if_overflow_happened
zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];
zero_out_if_overflow_happened[0] = new int_radix_lut<Torus>(
zero_out_if_overflow_happened = new int_radix_lut<Torus, KSTorus> *[2];
zero_out_if_overflow_happened[0] = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
zero_out_if_overflow_happened[1] = new int_radix_lut<Torus>(
zero_out_if_overflow_happened[1] = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
auto overflow_happened_f = [&](Torus block, Torus overflow_sum) -> Torus {
@@ -1127,14 +1131,15 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
zero_out_if_overflow_happened[1]->broadcast_lut(active_streams);
// merge_overflow_flags_luts
merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
merge_overflow_flags_luts =
new int_radix_lut<Torus, KSTorus> *[num_bits_in_message];
auto active_gpu_count_for_bits = streams.active_gpu_subset(1);
for (int i = 0; i < num_bits_in_message; i++) {
auto lut_f_bit = [i](Torus x, Torus y) -> Torus {
return (x == 0 && y == 0) << i;
};
merge_overflow_flags_luts[i] = new int_radix_lut<Torus>(
merge_overflow_flags_luts[i] = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
@@ -1157,21 +1162,21 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
if (params.message_modulus == 4 && params.carry_modulus == 4 &&
streams.count() >= 4) {
div_rem_2_2_mem = new unsigned_int_div_rem_2_2_memory<Torus>(
div_rem_2_2_mem = new unsigned_int_div_rem_2_2_memory<Torus, KSTorus>(
streams, params, num_blocks, allocate_gpu_memory, size_tracker);
return;
}
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus>(
shift_mem_1 = new int_logical_scalar_shift_buffer<Torus, KSTorus>(
streams, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, 2 * num_blocks,
allocate_gpu_memory, size_tracker);
shift_mem_2 = new int_logical_scalar_shift_buffer<Torus>(
shift_mem_2 = new int_logical_scalar_shift_buffer<Torus, KSTorus>(
streams, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, params, 2 * num_blocks,
allocate_gpu_memory, size_tracker);
uint32_t compute_overflow = 1;
overflow_sub_mem = new int_borrow_prop_memory<Torus>(
overflow_sub_mem = new int_borrow_prop_memory<Torus, KSTorus>(
streams, params, num_blocks, compute_overflow, allocate_gpu_memory,
size_tracker);
uint32_t group_size = overflow_sub_mem->group_size;
@@ -1180,7 +1185,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
create_indexes_for_overflow_sub(streams, num_blocks, group_size, use_seq,
allocate_gpu_memory, size_tracker);
comparison_buffer = new int_comparison_buffer<Torus>(
comparison_buffer = new int_comparison_buffer<Torus, KSTorus>(
streams, COMPARISON_TYPE::NE, params, num_blocks, false,
allocate_gpu_memory, size_tracker);
@@ -1440,21 +1445,21 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
free(scalars_for_overflow_sub);
}
};
template <typename Torus> struct int_div_rem_memory {
template <typename Torus, typename KSTorus> struct int_div_rem_memory {
int_radix_params params;
CudaStreams active_streams;
bool is_signed;
// memory objects for other operations
unsigned_int_div_rem_memory<Torus> *unsigned_mem;
int_abs_buffer<Torus> *abs_mem_1;
int_abs_buffer<Torus> *abs_mem_2;
int_sc_prop_memory<Torus> *scp_mem_1;
int_sc_prop_memory<Torus> *scp_mem_2;
int_cmux_buffer<Torus> *cmux_quotient_mem;
int_cmux_buffer<Torus> *cmux_remainder_mem;
unsigned_int_div_rem_memory<Torus, KSTorus> *unsigned_mem;
int_abs_buffer<Torus, KSTorus> *abs_mem_1;
int_abs_buffer<Torus, KSTorus> *abs_mem_2;
int_sc_prop_memory<Torus, KSTorus> *scp_mem_1;
int_sc_prop_memory<Torus, KSTorus> *scp_mem_2;
int_cmux_buffer<Torus, KSTorus> *cmux_quotient_mem;
int_cmux_buffer<Torus, KSTorus> *cmux_remainder_mem;
// lookup tables
int_radix_lut<Torus> *compare_signed_bits_lut;
int_radix_lut<Torus, KSTorus> *compare_signed_bits_lut;
// sub streams
CudaStreams sub_streams_1;
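The signed wrapper follows the pattern implied by its members: absolute values feed the unsigned division, and the sign bits steer two cmuxes. A hedged plaintext outline; the truncated-toward-zero semantics are an assumption consistent with the compare_signed_bits_lut member above.

#include <cstdlib>

struct DivRem {
  long q;
  long r;
};

// abs_mem_*: |a| and |b|; unsigned_mem: unsigned division; the two cmux
// buffers conditionally negate quotient and remainder.
DivRem signed_div_rem(long a, long b) {
  unsigned long ua = (unsigned long)std::labs(a);
  unsigned long ub = (unsigned long)std::labs(b);
  unsigned long uq = ua / ub;
  unsigned long ur = ua % ub;
  bool signs_differ = (a < 0) != (b < 0); // compare_signed_bits_lut
  DivRem out;
  out.q = signs_differ ? -(long)uq : (long)uq; // cmux_quotient_mem
  out.r = (a < 0) ? -(long)ur : (long)ur;      // cmux_remainder_mem
  return out;
}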
@@ -1477,22 +1482,22 @@ template <typename Torus> struct int_div_rem_memory {
this->params = params;
this->is_signed = is_signed;
unsigned_mem = new unsigned_int_div_rem_memory<Torus>(
unsigned_mem = new unsigned_int_div_rem_memory<Torus, KSTorus>(
streams, params, num_blocks, allocate_gpu_memory, size_tracker);
if (is_signed) {
Torus sign_bit_pos = 31 - __builtin_clz(params.message_modulus) - 1;
// init memory objects for other integer operations
abs_mem_1 = new int_abs_buffer<Torus>(streams, params, num_blocks,
allocate_gpu_memory, size_tracker);
abs_mem_2 = new int_abs_buffer<Torus>(streams, params, num_blocks,
allocate_gpu_memory, size_tracker);
abs_mem_1 = new int_abs_buffer<Torus, KSTorus>(
streams, params, num_blocks, allocate_gpu_memory, size_tracker);
abs_mem_2 = new int_abs_buffer<Torus, KSTorus>(
streams, params, num_blocks, allocate_gpu_memory, size_tracker);
uint32_t requested_flag = outputFlag::FLAG_NONE;
scp_mem_1 = new int_sc_prop_memory<Torus>(
scp_mem_1 = new int_sc_prop_memory<Torus, KSTorus>(
streams, params, num_blocks, requested_flag, allocate_gpu_memory,
size_tracker);
scp_mem_2 = new int_sc_prop_memory<Torus>(
scp_mem_2 = new int_sc_prop_memory<Torus, KSTorus>(
streams, params, num_blocks, requested_flag, allocate_gpu_memory,
size_tracker);
@@ -1503,10 +1508,10 @@ template <typename Torus> struct int_div_rem_memory {
return (x >> sign_bit_pos) == 1;
};
cmux_quotient_mem = new int_cmux_buffer<Torus>(
cmux_quotient_mem = new int_cmux_buffer<Torus, KSTorus>(
streams, quotient_predicate_lut_f, params, num_blocks,
allocate_gpu_memory, size_tracker);
cmux_remainder_mem = new int_cmux_buffer<Torus>(
cmux_remainder_mem = new int_cmux_buffer<Torus, KSTorus>(
streams, remainder_predicate_lut_f, params, num_blocks,
allocate_gpu_memory, size_tracker);
// init temporary memory buffers
@@ -1548,7 +1553,7 @@ template <typename Torus> struct int_div_rem_memory {
return (Torus)(x_sign_bit != y_sign_bit);
};
compare_signed_bits_lut = new int_radix_lut<Torus>(
compare_signed_bits_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(

View File

@@ -1,11 +1,12 @@
#include "integer_utilities.h"
template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
template <typename Torus, typename KSTorus>
struct int_prepare_count_of_consecutive_bits_buffer {
int_radix_params params;
bool allocate_gpu_memory;
int_radix_lut<Torus> *univ_lut_mem;
int_radix_lut<Torus> *biv_lut_mem;
int_radix_lut<Torus, KSTorus> *univ_lut_mem;
int_radix_lut<Torus, KSTorus> *biv_lut_mem;
Direction direction;
BitValue bit_value;
@@ -22,11 +23,11 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
this->bit_value = bit_value;
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
this->univ_lut_mem =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
new int_radix_lut<Torus, KSTorus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
this->biv_lut_mem =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
new int_radix_lut<Torus, KSTorus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
const uint32_t num_bits = std::log2(this->params.message_modulus);
@@ -97,16 +98,18 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
}
};
template <typename Torus> struct int_count_of_consecutive_bits_buffer {
template <typename Torus, typename KSTorus>
struct int_count_of_consecutive_bits_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t counter_num_blocks;
int_prepare_count_of_consecutive_bits_buffer<Torus> *prepare_mem = nullptr;
int_prepare_count_of_consecutive_bits_buffer<Torus, KSTorus> *prepare_mem =
nullptr;
CudaRadixCiphertextFFI *ct_prepared = nullptr;
int_sum_ciphertexts_vec_memory<Torus> *sum_mem = nullptr;
int_sc_prop_memory<Torus> *propagate_mem = nullptr;
int_sum_ciphertexts_vec_memory<Torus, KSTorus> *sum_mem = nullptr;
int_sc_prop_memory<Torus, KSTorus> *propagate_mem = nullptr;
CudaRadixCiphertextFFI *cts = nullptr;
int_count_of_consecutive_bits_buffer(CudaStreams streams,
@@ -126,9 +129,10 @@ template <typename Torus> struct int_count_of_consecutive_bits_buffer {
streams.stream(0), streams.gpu_index(0), ct_prepared, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->prepare_mem = new int_prepare_count_of_consecutive_bits_buffer<Torus>(
streams, params, num_radix_blocks, direction, bit_value,
allocate_gpu_memory, size_tracker);
this->prepare_mem =
new int_prepare_count_of_consecutive_bits_buffer<Torus, KSTorus>(
streams, params, num_radix_blocks, direction, bit_value,
allocate_gpu_memory, size_tracker);
this->cts = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -136,11 +140,11 @@ template <typename Torus> struct int_count_of_consecutive_bits_buffer {
counter_num_blocks * num_radix_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
this->sum_mem = new int_sum_ciphertexts_vec_memory<Torus>(
this->sum_mem = new int_sum_ciphertexts_vec_memory<Torus, KSTorus>(
streams, params, counter_num_blocks, num_radix_blocks, true,
allocate_gpu_memory, size_tracker);
this->propagate_mem = new int_sc_prop_memory<Torus>(
this->propagate_mem = new int_sc_prop_memory<Torus, KSTorus>(
streams, params, counter_num_blocks, FLAG_NONE, allocate_gpu_memory,
size_tracker);
}
@@ -171,16 +175,16 @@ template <typename Torus> struct int_count_of_consecutive_bits_buffer {
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
};
template <typename Torus> struct int_ilog2_buffer {
template <typename Torus, typename KSTorus> struct int_ilog2_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t input_num_blocks;
uint32_t counter_num_blocks;
uint32_t num_bits_in_ciphertext;
int_prepare_count_of_consecutive_bits_buffer<Torus> *prepare_mem;
int_sum_ciphertexts_vec_memory<Torus> *sum_mem;
int_fullprop_buffer<Torus> *final_propagate_mem;
int_prepare_count_of_consecutive_bits_buffer<Torus, KSTorus> *prepare_mem;
int_sum_ciphertexts_vec_memory<Torus, KSTorus> *sum_mem;
int_fullprop_buffer<Torus, KSTorus> *final_propagate_mem;
CudaRadixCiphertextFFI *ct_in_buffer;
CudaRadixCiphertextFFI *sum_input_cts;
@@ -189,8 +193,8 @@ template <typename Torus> struct int_ilog2_buffer {
CudaRadixCiphertextFFI *carry_blocks_not;
CudaRadixCiphertextFFI *rotated_carry_blocks;
int_radix_lut<Torus> *lut_message_not;
int_radix_lut<Torus> *lut_carry_not;
int_radix_lut<Torus, KSTorus> *lut_message_not;
int_radix_lut<Torus, KSTorus> *lut_carry_not;
int_ilog2_buffer(CudaStreams streams, const int_radix_params params,
uint32_t input_num_blocks, uint32_t counter_num_blocks,
@@ -209,9 +213,10 @@ template <typename Torus> struct int_ilog2_buffer {
input_num_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
this->prepare_mem = new int_prepare_count_of_consecutive_bits_buffer<Torus>(
streams, params, input_num_blocks, Leading, Zero, allocate_gpu_memory,
size_tracker);
this->prepare_mem =
new int_prepare_count_of_consecutive_bits_buffer<Torus, KSTorus>(
streams, params, input_num_blocks, Leading, Zero,
allocate_gpu_memory, size_tracker);
uint32_t sum_input_total_blocks =
(input_num_blocks + 1) * counter_num_blocks;
@@ -221,7 +226,7 @@ template <typename Torus> struct int_ilog2_buffer {
sum_input_total_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
this->sum_mem = new int_sum_ciphertexts_vec_memory<Torus>(
this->sum_mem = new int_sum_ciphertexts_vec_memory<Torus, KSTorus>(
streams, params, counter_num_blocks, input_num_blocks + 1, false,
allocate_gpu_memory, size_tracker);
@@ -231,9 +236,9 @@ template <typename Torus> struct int_ilog2_buffer {
this->sum_output_not_propagated, counter_num_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->lut_message_not =
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
allocate_gpu_memory, size_tracker);
this->lut_message_not = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, counter_num_blocks, allocate_gpu_memory,
size_tracker);
std::function<Torus(Torus)> lut_message_lambda =
[this](uint64_t x) -> uint64_t {
uint64_t message = x % this->params.message_modulus;
@@ -249,9 +254,9 @@ template <typename Torus> struct int_ilog2_buffer {
auto active_streams = streams.active_gpu_subset(counter_num_blocks);
lut_message_not->broadcast_lut(active_streams);
this->lut_carry_not =
new int_radix_lut<Torus>(streams, params, 1, counter_num_blocks,
allocate_gpu_memory, size_tracker);
this->lut_carry_not = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, counter_num_blocks, allocate_gpu_memory,
size_tracker);
std::function<Torus(Torus)> lut_carry_lambda =
[this](uint64_t x) -> uint64_t {
uint64_t carry = x / this->params.message_modulus;
@@ -283,7 +288,7 @@ template <typename Torus> struct int_ilog2_buffer {
counter_num_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
this->final_propagate_mem = new int_fullprop_buffer<Torus>(
this->final_propagate_mem = new int_fullprop_buffer<Torus, KSTorus>(
streams, params, allocate_gpu_memory, size_tracker);
}
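The buffer names above suggest the standard identity behind this operation: count the leading zeros (prepare_mem with Leading/Zero, summed by sum_mem), then compute ilog2(x) = (num_bits_in_ciphertext - 1) - clz(x), with the subtraction realized through the NOT LUTs and the carry rotation. A plaintext sketch of that identity; the homomorphic schedule itself is assumed, only the members are visible here.

#include <cassert>
#include <cstdint>

// ilog2 via a leading-zero count; assumes x != 0.
uint32_t ilog2_plain(uint64_t x, uint32_t num_bits_in_ciphertext) {
  uint32_t clz = 0;
  for (int b = (int)num_bits_in_ciphertext - 1; b >= 0 && !((x >> b) & 1);
       b--)
    clz++;
  return (num_bits_in_ciphertext - 1) - clz;
}

int main() { assert(ilog2_plain(12, 8) == 3); } // floor(log2(12)) == 3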

View File

@@ -280,7 +280,7 @@ struct int_radix_params {
};
// Store things needed to apply LUTs
template <typename InputTorus, typename OutputTorus>
template <typename InputTorus, typename KSTorus, typename OutputTorus>
struct int_radix_lut_custom_input_output {
int_radix_params params;
// The number of blocks to be processed by the LUT. Can be
@@ -337,7 +337,7 @@ struct int_radix_lut_custom_input_output {
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<InputTorus *> lwe_array_in_vec;
std::vector<InputTorus *> lwe_after_ks_vec;
std::vector<KSTorus *> lwe_after_ks_vec;
std::vector<OutputTorus *> lwe_after_pbs_vec;
std::vector<InputTorus *> lwe_trivial_indexes_vec;
std::vector<ks_mem<InputTorus> *>
@@ -912,12 +912,12 @@ struct int_radix_lut_custom_input_output {
}
};
template <typename Torus, typename OutputTorus = Torus>
using int_radix_lut = int_radix_lut_custom_input_output<Torus, Torus>;
template <typename Torus, typename KSTorus, typename OutputTorus = Torus>
using int_radix_lut = int_radix_lut_custom_input_output<Torus, KSTorus, Torus>;
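
The alias now threads the new KSTorus parameter through, while OutputTorus keeps defaulting to the input type; an illustrative instantiation for the ks32 path (concrete types assumed, not shown in this hunk):

using Lut64 = int_radix_lut<uint64_t, uint64_t>;     // classic 64-bit keyswitch
using Lut64Ks32 = int_radix_lut<uint64_t, uint32_t>; // keyswitch on 32-bit torus
// Lut64Ks32 expands to
// int_radix_lut_custom_input_output<uint64_t, uint32_t, uint64_t>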
template <typename InputTorus>
template <typename InputTorus, typename KSTorus>
struct int_noise_squashing_lut
: int_radix_lut_custom_input_output<InputTorus, __uint128_t> {
: int_radix_lut_custom_input_output<InputTorus, KSTorus, __uint128_t> {
std::vector<InputTorus *> lwe_aligned_scatter_vec;
std::vector<__uint128_t *> lwe_aligned_gather_vec;
@@ -929,7 +929,7 @@ struct int_noise_squashing_lut
uint32_t original_num_blocks,
bool allocate_gpu_memory, uint64_t &size_tracker)
: int_radix_lut_custom_input_output<InputTorus, __uint128_t>(
: int_radix_lut_custom_input_output<InputTorus, KSTorus, __uint128_t>(
streams, input_glwe_dimension * input_polynomial_size, params, 1,
num_radix_blocks, original_num_blocks, allocate_gpu_memory,
size_tracker) {
@@ -946,15 +946,16 @@ struct int_noise_squashing_lut
this->broadcast_lut(this->active_streams);
}
using int_radix_lut_custom_input_output<InputTorus, __uint128_t>::release;
using int_radix_lut_custom_input_output<InputTorus, KSTorus,
__uint128_t>::release;
};
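
Likewise for noise squashing, where OutputTorus is pinned to __uint128_t; a hypothetical instantiation for illustration:

using SquashLut = int_noise_squashing_lut<uint64_t, uint32_t>;
// i.e. int_radix_lut_custom_input_output<uint64_t, uint32_t, __uint128_t>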
// Forward declarations for operation buffers
template <typename Torus> struct int_sub_and_propagate;
template <typename Torus, typename KSTorus> struct int_sub_and_propagate;
template <typename Torus> struct int_bit_extract_luts_buffer {
template <typename Torus, typename KSTorus> struct int_bit_extract_luts_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
int_radix_lut<Torus, KSTorus> *lut;
bool gpu_memory_allocated;
// With offset
@@ -966,9 +967,9 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
this->params = params;
gpu_memory_allocated = allocate_gpu_memory;
lut = new int_radix_lut<Torus>(streams, params, bits_per_block,
bits_per_block * num_radix_blocks,
allocate_gpu_memory, size_tracker);
lut = new int_radix_lut<Torus, KSTorus>(streams, params, bits_per_block,
bits_per_block * num_radix_blocks,
allocate_gpu_memory, size_tracker);
for (int i = 0; i < bits_per_block; i++) {
@@ -1051,10 +1052,10 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
}
};
template <typename Torus> struct int_fullprop_buffer {
template <typename Torus, typename KSTorus> struct int_fullprop_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
int_radix_lut<Torus, KSTorus> *lut;
CudaRadixCiphertextFFI *tmp_small_lwe_vector;
CudaRadixCiphertextFFI *tmp_big_lwe_vector;
@@ -1064,8 +1065,8 @@ template <typename Torus> struct int_fullprop_buffer {
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
gpu_memory_allocated = allocate_gpu_memory;
lut = new int_radix_lut<Torus>(streams.get_ith(0), params, 2, 2,
allocate_gpu_memory, size_tracker);
lut = new int_radix_lut<Torus, KSTorus>(streams.get_ith(0), params, 2, 2,
allocate_gpu_memory, size_tracker);
// LUTs
auto lut_f_message = [params](Torus x) -> Torus {
@@ -1133,7 +1134,8 @@ template <typename Torus> struct int_fullprop_buffer {
}
};
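
The two full-propagation lambdas are cut off by the hunk above; a sketch of the classic message/carry pair, consistent with the split used throughout this file (message_modulus taken from params):

auto lut_f_message_sketch = [](uint64_t x, uint64_t message_modulus) -> uint64_t {
  return x % message_modulus; // the block keeps only its message part
};
auto lut_f_carry_sketch = [](uint64_t x, uint64_t message_modulus) -> uint64_t {
  return x / message_modulus; // the carry part, added into the next block
};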
template <typename Torus> struct int_sum_ciphertexts_vec_memory {
template <typename Torus, typename KSTorus>
struct int_sum_ciphertexts_vec_memory {
int_radix_params params;
uint32_t max_total_blocks_in_vec;
@@ -1158,7 +1160,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
uint64_t *d_degrees;
// lookup table for extracting message and carry
int_radix_lut<Torus> *luts_message_carry;
int_radix_lut<Torus, KSTorus> *luts_message_carry;
bool mem_reuse = false;
bool allocated_luts_message_carry;
@@ -1223,7 +1225,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
if (total_ciphertexts > 0 ||
reduce_degrees_for_single_carry_propagation) {
uint64_t size_tracker = 0;
luts_message_carry = new int_radix_lut<Torus>(
luts_message_carry = new int_radix_lut<Torus, KSTorus>(
streams, params, 2, pbs_count, true, size_tracker);
allocated_luts_message_carry = true;
uint64_t message_modulus_bits =
@@ -1293,9 +1295,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
uint32_t max_pbs_count = std::max(
2 * (max_total_blocks_in_vec / chunk_size), 2 * num_blocks_in_radix);
if (max_pbs_count > 0) {
int_radix_lut<Torus> *luts_message_carry_dry_run =
new int_radix_lut<Torus>(streams, params, 2, max_pbs_count, false,
size_tracker);
int_radix_lut<Torus, KSTorus> *luts_message_carry_dry_run =
new int_radix_lut<Torus, KSTorus>(streams, params, 2, max_pbs_count,
false, size_tracker);
luts_message_carry_dry_run->release(streams);
delete luts_message_carry_dry_run;
}
@@ -1318,7 +1320,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
CudaRadixCiphertextFFI *current_blocks,
CudaRadixCiphertextFFI *small_lwe_vector,
int_radix_lut<Torus> *reused_lut,
int_radix_lut<Torus, KSTorus> *reused_lut,
bool reduce_degrees_for_single_carry_propagation,
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->mem_reuse = true;
@@ -1391,10 +1393,10 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
};
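
A plaintext sketch of one reduction pass done with luts_message_carry (shown on a single row for clarity; the buffer batches the same LUT pair over many stacked rows in one launch):

#include <cstdint>
#include <vector>

static void reduce_pass(std::vector<uint64_t> &blocks, uint64_t message_modulus) {
  std::vector<uint64_t> carry(blocks.size() + 1, 0);
  for (size_t i = 0; i < blocks.size(); ++i) {
    carry[i + 1] = blocks[i] / message_modulus; // "carry" LUT, one block left
    blocks[i] = blocks[i] % message_modulus;    // "message" LUT
  }
  for (size_t i = 0; i < blocks.size(); ++i)
    blocks[i] += carry[i]; // re-add rows; the topmost carry is truncated away
}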
// For sequential algorithm in group propagation
template <typename Torus> struct int_seq_group_prop_memory {
template <typename Torus, typename KSTorus> struct int_seq_group_prop_memory {
CudaRadixCiphertextFFI *group_resolved_carries;
int_radix_lut<Torus> *lut_sequential_algorithm;
int_radix_lut<Torus, KSTorus> *lut_sequential_algorithm;
uint32_t grouping_size;
bool gpu_memory_allocated;
@@ -1416,9 +1418,9 @@ template <typename Torus> struct int_seq_group_prop_memory {
int num_seq_luts = grouping_size - 1;
Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
lut_sequential_algorithm =
new int_radix_lut<Torus>(streams, params, num_seq_luts, num_seq_luts,
allocate_gpu_memory, size_tracker);
lut_sequential_algorithm = new int_radix_lut<Torus, KSTorus>(
streams, params, num_seq_luts, num_seq_luts, allocate_gpu_memory,
size_tracker);
for (int index = 0; index < num_seq_luts; index++) {
auto f_lut_sequential = [index](Torus propa_cum_sum_block) {
return (propa_cum_sum_block >> (index + 1)) & 1;
@@ -1452,9 +1454,9 @@ template <typename Torus> struct int_seq_group_prop_memory {
};
// For hillis steele algorithm in group propagation
template <typename Torus> struct int_hs_group_prop_memory {
template <typename Torus, typename KSTorus> struct int_hs_group_prop_memory {
int_radix_lut<Torus> *lut_hillis_steele;
int_radix_lut<Torus, KSTorus> *lut_hillis_steele;
bool gpu_memory_allocated;
int_hs_group_prop_memory(CudaStreams streams, int_radix_params params,
@@ -1481,7 +1483,7 @@ template <typename Torus> struct int_hs_group_prop_memory {
}
};
lut_hillis_steele = new int_radix_lut<Torus>(
lut_hillis_steele = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_groups, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
@@ -1502,12 +1504,13 @@ template <typename Torus> struct int_hs_group_prop_memory {
};
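
A plaintext sketch of the Hillis-Steele inclusive scan that lut_hillis_steele evaluates over per-group carry states; the combine operator is left generic here (the real one merges generate/propagate states):

#include <vector>

template <typename T, typename Op>
std::vector<T> hillis_steele_scan(std::vector<T> a, Op combine) {
  for (size_t step = 1; step < a.size(); step *= 2) {
    std::vector<T> next = a;
    for (size_t i = step; i < a.size(); ++i)
      next[i] = combine(a[i - step], a[i]); // ceil(log2(n)) parallel rounds
    a = next;
  }
  return a; // a[i] = fold of inputs 0..i
}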
// compute_shifted_blocks_and_block_states
template <typename Torus> struct int_shifted_blocks_and_states_memory {
template <typename Torus, typename KSTorus>
struct int_shifted_blocks_and_states_memory {
CudaRadixCiphertextFFI *shifted_blocks_and_states;
CudaRadixCiphertextFFI *shifted_blocks;
CudaRadixCiphertextFFI *block_states;
int_radix_lut<Torus> *luts_array_first_step;
int_radix_lut<Torus, KSTorus> *luts_array_first_step;
bool gpu_memory_allocated;
int_shifted_blocks_and_states_memory(
@@ -1538,7 +1541,7 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
uint32_t num_luts_first_step = 2 * grouping_size + 1;
luts_array_first_step = new int_radix_lut<Torus>(
luts_array_first_step = new int_radix_lut<Torus, KSTorus>(
streams, params, num_luts_first_step, num_radix_blocks, num_many_lut,
allocate_gpu_memory, size_tracker);
@@ -1690,7 +1693,8 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
};
// compute_propagation simulator and group carries
template <typename Torus> struct int_prop_simu_group_carries_memory {
template <typename Torus, typename KSTorus>
struct int_prop_simu_group_carries_memory {
CudaRadixCiphertextFFI *propagation_cum_sums;
CudaRadixCiphertextFFI *simulators;
CudaRadixCiphertextFFI *prepared_blocks;
@@ -1700,10 +1704,10 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
Torus *scalar_array_cum_sum;
Torus *h_scalar_array_cum_sum;
int_radix_lut<Torus> *luts_array_second_step;
int_radix_lut<Torus, KSTorus> *luts_array_second_step;
int_seq_group_prop_memory<Torus> *seq_group_prop_mem;
int_hs_group_prop_memory<Torus> *hs_group_prop_mem;
int_seq_group_prop_memory<Torus, KSTorus> *seq_group_prop_mem;
int_hs_group_prop_memory<Torus, KSTorus> *hs_group_prop_mem;
uint32_t group_size;
bool use_sequential_algorithm_to_resolve_group_carries;
@@ -1782,7 +1786,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
}
uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
luts_array_second_step = new int_radix_lut<Torus>(
luts_array_second_step = new int_radix_lut<Torus, KSTorus>(
streams, params, num_luts_second_step, num_radix_blocks,
allocate_gpu_memory, size_tracker);
@@ -1937,12 +1941,12 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
if (use_sequential_algorithm_to_resolve_group_carries) {
seq_group_prop_mem = new int_seq_group_prop_memory<Torus>(
seq_group_prop_mem = new int_seq_group_prop_memory<Torus, KSTorus>(
streams, params, grouping_size, big_lwe_size_bytes,
allocate_gpu_memory, size_tracker);
} else {
hs_group_prop_mem = new int_hs_group_prop_memory<Torus>(
hs_group_prop_mem = new int_hs_group_prop_memory<Torus, KSTorus>(
streams, params, num_groups, big_lwe_size_bytes, allocate_gpu_memory,
size_tracker);
}
@@ -2001,7 +2005,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
};
};
template <typename Torus> struct int_sc_prop_memory {
template <typename Torus, typename KSTorus> struct int_sc_prop_memory {
uint32_t num_many_lut;
uint32_t lut_stride;
@@ -2009,12 +2013,14 @@ template <typename Torus> struct int_sc_prop_memory {
CudaRadixCiphertextFFI *output_flag;
CudaRadixCiphertextFFI *last_lhs;
CudaRadixCiphertextFFI *last_rhs;
int_radix_lut<Torus> *lut_message_extract;
int_radix_lut<Torus, KSTorus> *lut_message_extract;
int_radix_lut<Torus> *lut_overflow_flag_prep;
int_radix_lut<Torus, KSTorus> *lut_overflow_flag_prep;
int_shifted_blocks_and_states_memory<Torus> *shifted_blocks_state_mem;
int_prop_simu_group_carries_memory<Torus> *prop_simu_group_carries_mem;
int_shifted_blocks_and_states_memory<Torus, KSTorus>
*shifted_blocks_state_mem;
int_prop_simu_group_carries_memory<Torus, KSTorus>
*prop_simu_group_carries_mem;
int_radix_params params;
uint32_t requested_flag;
@@ -2040,18 +2046,20 @@ template <typename Torus> struct int_sc_prop_memory {
uint32_t box_size = polynomial_size / block_modulus;
lut_stride = (block_modulus / num_many_lut) * box_size;
shifted_blocks_state_mem = new int_shifted_blocks_and_states_memory<Torus>(
streams, params, num_radix_blocks, num_many_lut, grouping_size,
allocate_gpu_memory, size_tracker);
shifted_blocks_state_mem =
new int_shifted_blocks_and_states_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, num_many_lut, grouping_size,
allocate_gpu_memory, size_tracker);
prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory<Torus>(
streams, params, num_radix_blocks, grouping_size, num_groups,
allocate_gpu_memory, size_tracker);
prop_simu_group_carries_mem =
new int_prop_simu_group_carries_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, grouping_size, num_groups,
allocate_gpu_memory, size_tracker);
// Step 3 elements
int num_luts_message_extract =
requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
lut_message_extract = new int_radix_lut<Torus>(
lut_message_extract = new int_radix_lut<Torus, KSTorus>(
streams, params, num_luts_message_extract, num_radix_blocks + 1,
allocate_gpu_memory, size_tracker);
// lut for the first block in the first grouping
@@ -2086,7 +2094,7 @@ template <typename Torus> struct int_sc_prop_memory {
// For step 1 overflow should be enable only if flag overflow
uint32_t num_bits_in_message = std::log2(message_modulus);
lut_overflow_flag_prep = new int_radix_lut<Torus>(
lut_overflow_flag_prep = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
auto f_overflow_fp = [num_bits_in_message](Torus lhs,
@@ -2226,12 +2234,13 @@ template <typename Torus> struct int_sc_prop_memory {
};
};
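
A plaintext sketch of what steps 1-2 prepare (illustrative encoding; the real states are packed into block bits): each block either generates a carry, propagates one, or neither, and the carry into block i resolves to "some block below i generates, with only propagators in between":

enum BlockState { NEITHER = 0, GENERATES = 1, PROPAGATES = 2 };

static bool carry_into(const BlockState *state, int i) {
  for (int j = i - 1; j >= 0; --j) {
    if (state[j] == GENERATES) return true;   // carry created below
    if (state[j] != PROPAGATES) return false; // chain broken
  }
  return false; // no incoming carry
}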
template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
template <typename Torus, typename KSTorus>
struct int_shifted_blocks_and_borrow_states_memory {
CudaRadixCiphertextFFI *shifted_blocks_and_borrow_states;
CudaRadixCiphertextFFI *shifted_blocks;
CudaRadixCiphertextFFI *borrow_states;
int_radix_lut<Torus> *luts_array_first_step;
int_radix_lut<Torus, KSTorus> *luts_array_first_step;
bool gpu_memory_allocated;
int_shifted_blocks_and_borrow_states_memory(
@@ -2263,7 +2272,7 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
uint32_t num_luts_first_step = 2 * grouping_size + 1;
luts_array_first_step = new int_radix_lut<Torus>(
luts_array_first_step = new int_radix_lut<Torus, KSTorus>(
streams, params, num_luts_first_step, num_radix_blocks, num_many_lut,
allocate_gpu_memory, size_tracker);
@@ -2427,7 +2436,7 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
};
};
template <typename Torus> struct int_borrow_prop_memory {
template <typename Torus, typename KSTorus> struct int_borrow_prop_memory {
uint32_t num_many_lut;
uint32_t lut_stride;
@@ -2435,12 +2444,13 @@ template <typename Torus> struct int_borrow_prop_memory {
uint32_t num_groups;
CudaRadixCiphertextFFI *overflow_block;
int_radix_lut<Torus> *lut_message_extract;
int_radix_lut<Torus> *lut_borrow_flag;
int_radix_lut<Torus, KSTorus> *lut_message_extract;
int_radix_lut<Torus, KSTorus> *lut_borrow_flag;
int_shifted_blocks_and_borrow_states_memory<Torus>
int_shifted_blocks_and_borrow_states_memory<Torus, KSTorus>
*shifted_blocks_borrow_state_mem;
int_prop_simu_group_carries_memory<Torus> *prop_simu_group_carries_mem;
int_prop_simu_group_carries_memory<Torus, KSTorus>
*prop_simu_group_carries_mem;
int_radix_params params;
@@ -2472,13 +2482,14 @@ template <typename Torus> struct int_borrow_prop_memory {
lut_stride = (block_modulus / num_many_lut) * box_size;
shifted_blocks_borrow_state_mem =
new int_shifted_blocks_and_borrow_states_memory<Torus>(
new int_shifted_blocks_and_borrow_states_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, num_many_lut, grouping_size,
allocate_gpu_memory, size_tracker);
prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory<Torus>(
streams, params, num_radix_blocks, grouping_size, num_groups,
allocate_gpu_memory, size_tracker);
prop_simu_group_carries_mem =
new int_prop_simu_group_carries_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, grouping_size, num_groups,
allocate_gpu_memory, size_tracker);
overflow_block = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -2486,8 +2497,8 @@ template <typename Torus> struct int_borrow_prop_memory {
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
lut_message_extract =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
new int_radix_lut<Torus, KSTorus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
// lut for the first block in the first grouping
auto f_message_extract = [message_modulus](Torus block) -> Torus {
return (block >> 1) % message_modulus;
@@ -2504,9 +2515,9 @@ template <typename Torus> struct int_borrow_prop_memory {
lut_message_extract->broadcast_lut(active_streams);
if (compute_overflow) {
lut_borrow_flag =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
lut_borrow_flag = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_radix_blocks, allocate_gpu_memory,
size_tracker);
// lut for the first block in the first grouping
auto f_borrow_flag = [](Torus block) -> Torus {
return ((block >> 2) & 1);

@@ -2,17 +2,17 @@
#include "cmux.h"
#include "integer_utilities.h"
template <typename Torus> struct int_mul_memory {
template <typename Torus, typename KSTorus> struct int_mul_memory {
CudaRadixCiphertextFFI *vector_result_sb;
CudaRadixCiphertextFFI *block_mul_res;
CudaRadixCiphertextFFI *small_lwe_vector;
int_radix_lut<Torus> *luts_array; // lsb msb
int_radix_lut<Torus> *zero_out_predicate_lut;
int_radix_lut<Torus, KSTorus> *luts_array; // lsb msb
int_radix_lut<Torus, KSTorus> *zero_out_predicate_lut;
int_sum_ciphertexts_vec_memory<Torus> *sum_ciphertexts_mem;
int_sc_prop_memory<Torus> *sc_prop_mem;
int_zero_out_if_buffer<Torus> *zero_out_mem;
int_sum_ciphertexts_vec_memory<Torus, KSTorus> *sum_ciphertexts_mem;
int_sc_prop_memory<Torus, KSTorus> *sc_prop_mem;
int_zero_out_if_buffer<Torus, KSTorus> *zero_out_mem;
int_radix_params params;
bool boolean_mul = false;
@@ -34,9 +34,9 @@ template <typename Torus> struct int_mul_memory {
else
return block;
};
zero_out_predicate_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
zero_out_predicate_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_radix_blocks, allocate_gpu_memory,
size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
zero_out_predicate_lut->get_lut(0, 0),
@@ -48,7 +48,7 @@ template <typename Torus> struct int_mul_memory {
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
zero_out_predicate_lut->broadcast_lut(active_streams);
zero_out_mem = new int_zero_out_if_buffer<Torus>(
zero_out_mem = new int_zero_out_if_buffer<Torus, KSTorus>(
streams, params, num_radix_blocks, allocate_gpu_memory, size_tracker);
return;
@@ -88,7 +88,8 @@ template <typename Torus> struct int_mul_memory {
// create int_radix_lut objects for lsb, msb, message, carry
// luts_array -> lut = {lsb_acc, msb_acc}
luts_array = new int_radix_lut<Torus>(streams, params, 2, total_block_count,
luts_array =
new int_radix_lut<Torus, KSTorus>(streams, params, 2, total_block_count,
allocate_gpu_memory, size_tracker);
auto lsb_acc = luts_array->get_lut(0, 0);
auto msb_acc = luts_array->get_lut(0, 1);
@@ -125,12 +126,12 @@ template <typename Torus> struct int_mul_memory {
auto active_streams = streams.active_gpu_subset(total_block_count);
luts_array->broadcast_lut(active_streams);
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, 2 * num_radix_blocks,
vector_result_sb, small_lwe_vector, luts_array, true,
allocate_gpu_memory, size_tracker);
uint32_t requested_flag = outputFlag::FLAG_NONE;
sc_prop_mem = new int_sc_prop_memory<Torus>(
sc_prop_mem = new int_sc_prop_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, requested_flag, allocate_gpu_memory,
size_tracker);
}
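
A plaintext sketch of the lsb/msb decomposition behind luts_array: every pairwise block product is split into a low part and a high part landing one block higher, and the resulting rows are summed by sum_ciphertexts_mem, then cleaned by sc_prop_mem (operand sizes assumed equal, result truncated):

#include <cstdint>
#include <vector>

static std::vector<uint64_t> schoolbook_rows(const std::vector<uint64_t> &a,
                                             const std::vector<uint64_t> &b,
                                             uint64_t message_modulus) {
  std::vector<uint64_t> acc(a.size(), 0);
  for (size_t i = 0; i < b.size(); ++i)
    for (size_t j = 0; i + j < a.size(); ++j) {
      uint64_t prod = a[j] * b[i];
      acc[i + j] += prod % message_modulus; // "lsb" LUT
      if (i + j + 1 < acc.size())
        acc[i + j + 1] += prod / message_modulus; // "msb" LUT
    }
  return acc; // blocks are overfull here: carry propagation still required
}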

@@ -1,14 +1,15 @@
#pragma once
#include "integer_utilities.h"
template <typename Torus> struct int_scalar_mul_buffer;
template <typename Torus> struct int_logical_scalar_shift_buffer;
template <typename Torus, typename KSTorus> struct int_scalar_mul_buffer;
template <typename Torus, typename KSTorus>
struct int_logical_scalar_shift_buffer;
template <typename Torus> struct int_grouped_oprf_memory {
template <typename Torus, typename KSTorus> struct int_grouped_oprf_memory {
int_radix_params params;
bool allocate_gpu_memory;
int_radix_lut<Torus> *luts;
int_radix_lut<Torus, KSTorus> *luts;
CudaRadixCiphertextFFI *plaintext_corrections;
Torus *h_lut_indexes;
@@ -32,7 +33,7 @@ template <typename Torus> struct int_grouped_oprf_memory {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->luts = new int_radix_lut<Torus>(
this->luts = new int_radix_lut<Torus, KSTorus>(
streams, params, message_bits_per_block, num_blocks_to_process,
allocate_gpu_memory, size_tracker);
@@ -149,13 +150,14 @@ template <typename Torus> struct int_grouped_oprf_memory {
}
};
template <typename Torus> struct int_grouped_oprf_custom_range_memory {
template <typename Torus, typename KSTorus>
struct int_grouped_oprf_custom_range_memory {
int_radix_params params;
bool allocate_gpu_memory;
int_grouped_oprf_memory<Torus> *grouped_oprf_memory;
int_scalar_mul_buffer<Torus> *scalar_mul_buffer;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_buffer;
int_grouped_oprf_memory<Torus, KSTorus> *grouped_oprf_memory;
int_scalar_mul_buffer<Torus, KSTorus> *scalar_mul_buffer;
int_logical_scalar_shift_buffer<Torus, KSTorus> *logical_scalar_shift_buffer;
CudaRadixCiphertextFFI *tmp_oprf_output;
uint32_t num_random_input_blocks;
@@ -171,16 +173,16 @@ template <typename Torus> struct int_grouped_oprf_custom_range_memory {
(num_input_random_bits + message_bits_per_block - 1) /
message_bits_per_block;
this->grouped_oprf_memory = new int_grouped_oprf_memory<Torus>(
this->grouped_oprf_memory = new int_grouped_oprf_memory<Torus, KSTorus>(
streams, params, this->num_random_input_blocks, message_bits_per_block,
num_input_random_bits, allocate_gpu_memory, size_tracker);
this->scalar_mul_buffer = new int_scalar_mul_buffer<Torus>(
this->scalar_mul_buffer = new int_scalar_mul_buffer<Torus, KSTorus>(
streams, params, num_blocks_intermediate, num_scalar_bits,
allocate_gpu_memory, true, size_tracker);
this->logical_scalar_shift_buffer =
new int_logical_scalar_shift_buffer<Torus>(
new int_logical_scalar_shift_buffer<Torus, KSTorus>(
streams, RIGHT_SHIFT, params, num_blocks_intermediate,
allocate_gpu_memory, size_tracker);
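
One plausible reading of this composition (an assumption inferred from the buffer sequence OPRF bits -> scalar multiply -> logical right shift, not stated in the commit): draw r uniform in [0, 2^k), then (r * range) >> k lands in [0, range) without rejection sampling.

#include <cstdint>

static uint64_t oprf_custom_range_sketch(uint64_t r, uint64_t range, unsigned k) {
  // wide intermediate product, matching the intermediate block count above
  return (uint64_t)(((__uint128_t)r * range) >> k);
}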

@@ -4,16 +4,17 @@
#include "integer_utilities.h"
#include "scalar_mul.h"
template <typename Torus> struct int_unsigned_scalar_div_mem {
template <typename Torus, typename KSTorus> struct int_unsigned_scalar_div_mem {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *tmp_ffi = nullptr;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_mem = nullptr;
int_scalar_mul_high_buffer<Torus> *scalar_mul_high_mem = nullptr;
int_sc_prop_memory<Torus> *scp_mem = nullptr;
int_sub_and_propagate<Torus> *sub_and_propagate_mem = nullptr;
int_logical_scalar_shift_buffer<Torus, KSTorus> *logical_scalar_shift_mem =
nullptr;
int_scalar_mul_high_buffer<Torus, KSTorus> *scalar_mul_high_mem = nullptr;
int_sc_prop_memory<Torus, KSTorus> *scp_mem = nullptr;
int_sub_and_propagate<Torus, KSTorus> *sub_and_propagate_mem = nullptr;
int_unsigned_scalar_div_mem(CudaStreams streams,
const int_radix_params params,
@@ -28,9 +29,10 @@ template <typename Torus> struct int_unsigned_scalar_div_mem {
if (!scalar_divisor_ffi->is_abs_divisor_one) {
if (scalar_divisor_ffi->is_divisor_pow2) {
logical_scalar_shift_mem = new int_logical_scalar_shift_buffer<Torus>(
streams, RIGHT_SHIFT, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
logical_scalar_shift_mem =
new int_logical_scalar_shift_buffer<Torus, KSTorus>(
streams, RIGHT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
} else if (scalar_divisor_ffi->divisor_has_more_bits_than_numerator) {
@@ -42,16 +44,17 @@ template <typename Torus> struct int_unsigned_scalar_div_mem {
} else if (scalar_divisor_ffi
->is_chosen_multiplier_geq_two_pow_numerator) {
logical_scalar_shift_mem = new int_logical_scalar_shift_buffer<Torus>(
streams, RIGHT_SHIFT, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
scalar_mul_high_mem = new int_scalar_mul_high_buffer<Torus>(
logical_scalar_shift_mem =
new int_logical_scalar_shift_buffer<Torus, KSTorus>(
streams, RIGHT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
scalar_mul_high_mem = new int_scalar_mul_high_buffer<Torus, KSTorus>(
streams, params, num_radix_blocks, scalar_divisor_ffi->active_bits,
allocate_gpu_memory, size_tracker);
scp_mem = new int_sc_prop_memory<Torus>(
scp_mem = new int_sc_prop_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
size_tracker);
sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
sub_and_propagate_mem = new int_sub_and_propagate<Torus, KSTorus>(
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
size_tracker);
tmp_ffi = new CudaRadixCiphertextFFI;
@@ -61,10 +64,11 @@ template <typename Torus> struct int_unsigned_scalar_div_mem {
} else {
logical_scalar_shift_mem = new int_logical_scalar_shift_buffer<Torus>(
streams, RIGHT_SHIFT, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
scalar_mul_high_mem = new int_scalar_mul_high_buffer<Torus>(
logical_scalar_shift_mem =
new int_logical_scalar_shift_buffer<Torus, KSTorus>(
streams, RIGHT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
scalar_mul_high_mem = new int_scalar_mul_high_buffer<Torus, KSTorus>(
streams, params, num_radix_blocks, scalar_divisor_ffi->active_bits,
allocate_gpu_memory, size_tracker);
}
@@ -98,19 +102,21 @@ template <typename Torus> struct int_unsigned_scalar_div_mem {
}
};
template <typename Torus> struct int_signed_scalar_div_mem {
template <typename Torus, typename KSTorus> struct int_signed_scalar_div_mem {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *tmp_ffi = nullptr;
CudaRadixCiphertextFFI *xsign_ffi = nullptr;
int_arithmetic_scalar_shift_buffer<Torus> *arithmetic_scalar_shift_mem =
int_arithmetic_scalar_shift_buffer<Torus, KSTorus>
*arithmetic_scalar_shift_mem = nullptr;
int_logical_scalar_shift_buffer<Torus, KSTorus> *logical_scalar_shift_mem =
nullptr;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_mem = nullptr;
int_signed_scalar_mul_high_buffer<Torus> *scalar_mul_high_mem = nullptr;
int_sc_prop_memory<Torus> *scp_mem = nullptr;
int_sub_and_propagate<Torus> *sub_and_propagate_mem = nullptr;
int_signed_scalar_mul_high_buffer<Torus, KSTorus> *scalar_mul_high_mem =
nullptr;
int_sc_prop_memory<Torus, KSTorus> *scp_mem = nullptr;
int_sub_and_propagate<Torus, KSTorus> *sub_and_propagate_mem = nullptr;
int_signed_scalar_div_mem(CudaStreams streams, const int_radix_params params,
uint32_t num_radix_blocks,
@@ -139,16 +145,17 @@ template <typename Torus> struct int_signed_scalar_div_mem {
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
arithmetic_scalar_shift_mem =
new int_arithmetic_scalar_shift_buffer<Torus>(
new int_arithmetic_scalar_shift_buffer<Torus, KSTorus>(
streams, RIGHT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
if (scalar_divisor_ffi->is_divisor_pow2) {
logical_scalar_shift_mem = new int_logical_scalar_shift_buffer<Torus>(
streams, RIGHT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
scp_mem = new int_sc_prop_memory<Torus>(
logical_scalar_shift_mem =
new int_logical_scalar_shift_buffer<Torus, KSTorus>(
streams, RIGHT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
scp_mem = new int_sc_prop_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
size_tracker);
@@ -160,17 +167,18 @@ template <typename Torus> struct int_signed_scalar_div_mem {
num_radix_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
scalar_mul_high_mem = new int_signed_scalar_mul_high_buffer<Torus>(
streams, params, num_radix_blocks,
scalar_divisor_ffi->active_bits, allocate_gpu_memory,
size_tracker);
scalar_mul_high_mem =
new int_signed_scalar_mul_high_buffer<Torus, KSTorus>(
streams, params, num_radix_blocks,
scalar_divisor_ffi->active_bits, allocate_gpu_memory,
size_tracker);
sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
sub_and_propagate_mem = new int_sub_and_propagate<Torus, KSTorus>(
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
size_tracker);
if (scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
scp_mem = new int_sc_prop_memory<Torus>(
scp_mem = new int_sc_prop_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, FLAG_NONE,
allocate_gpu_memory, size_tracker);
}
@@ -215,16 +223,17 @@ template <typename Torus> struct int_signed_scalar_div_mem {
}
};
template <typename Torus> struct int_unsigned_scalar_div_rem_buffer {
template <typename Torus, typename KSTorus>
struct int_unsigned_scalar_div_rem_buffer {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *numerator_ct;
int_unsigned_scalar_div_mem<Torus> *unsigned_div_mem;
int_bitop_buffer<Torus> *bitop_mem = nullptr;
int_scalar_mul_buffer<Torus> *scalar_mul_mem = nullptr;
int_sub_and_propagate<Torus> *sub_and_propagate_mem = nullptr;
int_unsigned_scalar_div_mem<Torus, KSTorus> *unsigned_div_mem;
int_bitop_buffer<Torus, KSTorus> *bitop_mem = nullptr;
int_scalar_mul_buffer<Torus, KSTorus> *scalar_mul_mem = nullptr;
int_sub_and_propagate<Torus, KSTorus> *sub_and_propagate_mem = nullptr;
int_unsigned_scalar_div_rem_buffer(
CudaStreams streams, const int_radix_params params,
@@ -240,22 +249,22 @@ template <typename Torus> struct int_unsigned_scalar_div_rem_buffer {
streams.stream(0), streams.gpu_index(0), numerator_ct, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->unsigned_div_mem = new int_unsigned_scalar_div_mem<Torus>(
this->unsigned_div_mem = new int_unsigned_scalar_div_mem<Torus, KSTorus>(
streams, params, num_radix_blocks, scalar_divisor_ffi,
allocate_gpu_memory, size_tracker);
if (scalar_divisor_ffi->is_divisor_pow2) {
this->bitop_mem = new int_bitop_buffer<Torus>(
this->bitop_mem = new int_bitop_buffer<Torus, KSTorus>(
streams, SCALAR_BITAND, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
} else {
if (!scalar_divisor_ffi->is_divisor_zero &&
!scalar_divisor_ffi->is_abs_divisor_one && num_radix_blocks != 0) {
this->scalar_mul_mem = new int_scalar_mul_buffer<Torus>(
this->scalar_mul_mem = new int_scalar_mul_buffer<Torus, KSTorus>(
streams, params, num_radix_blocks, active_bits_divisor,
allocate_gpu_memory, true, size_tracker);
}
this->sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
this->sub_and_propagate_mem = new int_sub_and_propagate<Torus, KSTorus>(
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
size_tracker);
}
@@ -286,17 +295,19 @@ template <typename Torus> struct int_unsigned_scalar_div_rem_buffer {
}
};
template <typename Torus> struct int_signed_scalar_div_rem_buffer {
template <typename Torus, typename KSTorus>
struct int_signed_scalar_div_rem_buffer {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *numerator_ct;
int_signed_scalar_div_mem<Torus> *signed_div_mem;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_mem = nullptr;
int_scalar_mul_buffer<Torus> *scalar_mul_mem = nullptr;
int_sub_and_propagate<Torus> *sub_and_propagate_mem;
int_sc_prop_memory<Torus> *scp_mem;
int_signed_scalar_div_mem<Torus, KSTorus> *signed_div_mem;
int_logical_scalar_shift_buffer<Torus, KSTorus> *logical_scalar_shift_mem =
nullptr;
int_scalar_mul_buffer<Torus, KSTorus> *scalar_mul_mem = nullptr;
int_sub_and_propagate<Torus, KSTorus> *sub_and_propagate_mem;
int_sc_prop_memory<Torus, KSTorus> *scp_mem;
int_signed_scalar_div_rem_buffer(
CudaStreams streams, const int_radix_params params,
@@ -312,11 +323,11 @@ template <typename Torus> struct int_signed_scalar_div_rem_buffer {
streams.stream(0), streams.gpu_index(0), numerator_ct, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->signed_div_mem = new int_signed_scalar_div_mem<Torus>(
this->signed_div_mem = new int_signed_scalar_div_mem<Torus, KSTorus>(
streams, params, num_radix_blocks, scalar_divisor_ffi,
allocate_gpu_memory, size_tracker);
this->scp_mem = new int_sc_prop_memory<Torus>(
this->scp_mem = new int_sc_prop_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
size_tracker);
@@ -326,18 +337,18 @@ template <typename Torus> struct int_signed_scalar_div_rem_buffer {
if (!scalar_divisor_ffi->is_divisor_negative &&
scalar_divisor_ffi->is_divisor_pow2) {
this->logical_scalar_shift_mem =
new int_logical_scalar_shift_buffer<Torus>(
new int_logical_scalar_shift_buffer<Torus, KSTorus>(
streams, LEFT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
} else if (!scalar_divisor_ffi->is_divisor_zero && !is_divisor_one &&
num_radix_blocks != 0) {
this->scalar_mul_mem = new int_scalar_mul_buffer<Torus>(
this->scalar_mul_mem = new int_scalar_mul_buffer<Torus, KSTorus>(
streams, params, num_radix_blocks, active_bits_divisor,
allocate_gpu_memory, true, size_tracker);
}
this->sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
this->sub_and_propagate_mem = new int_sub_and_propagate<Torus, KSTorus>(
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
size_tracker);
}
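
A plaintext sketch of the unsigned "chosen multiplier >= 2^numerator" branch, in the spirit of Granlund-Montgomery division by invariant integers (multiplier and shift amounts are illustrative; the commit only shows which buffers exist): multiply-high, subtract, halve, add back, final shift.

#include <cstdint>

static uint64_t div_by_invariant(uint64_t n, uint64_t m, unsigned post_shift) {
  uint64_t hi = (uint64_t)(((__uint128_t)m * n) >> 64); // scalar_mul_high_mem
  uint64_t t = (n - hi) >> 1;    // sub_and_propagate_mem + logical shift
  return (hi + t) >> post_shift; // scp_mem (add) + final logical shift
}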

@@ -2,13 +2,13 @@
#include "integer_utilities.h"
#include "scalar_shifts.h"
template <typename Torus> struct int_scalar_mul_buffer {
template <typename Torus, typename KSTorus> struct int_scalar_mul_buffer {
int_radix_params params;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_buffer;
int_sum_ciphertexts_vec_memory<Torus> *sum_ciphertexts_vec_mem;
int_logical_scalar_shift_buffer<Torus, KSTorus> *logical_scalar_shift_buffer;
int_sum_ciphertexts_vec_memory<Torus, KSTorus> *sum_ciphertexts_vec_mem;
CudaRadixCiphertextFFI *preshifted_buffer;
CudaRadixCiphertextFFI *all_shifted_buffer;
int_sc_prop_memory<Torus> *sc_prop_mem;
int_sc_prop_memory<Torus, KSTorus> *sc_prop_mem;
bool anticipated_buffers_drop;
bool gpu_memory_allocated;
uint32_t num_ciphertext_bits;
@@ -41,22 +41,25 @@ template <typename Torus> struct int_scalar_mul_buffer {
size_tracker, allocate_gpu_memory);
if (num_ciphertext_bits * num_radix_blocks >= num_radix_blocks + 2)
logical_scalar_shift_buffer = new int_logical_scalar_shift_buffer<Torus>(
streams, LEFT_SHIFT, params, num_radix_blocks, allocate_gpu_memory,
all_shifted_buffer, anticipated_drop_mem);
logical_scalar_shift_buffer =
new int_logical_scalar_shift_buffer<Torus, KSTorus>(
streams, LEFT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, all_shifted_buffer, anticipated_drop_mem);
else
logical_scalar_shift_buffer = new int_logical_scalar_shift_buffer<Torus>(
streams, LEFT_SHIFT, params, num_radix_blocks, allocate_gpu_memory,
anticipated_drop_mem);
logical_scalar_shift_buffer =
new int_logical_scalar_shift_buffer<Torus, KSTorus>(
streams, LEFT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, anticipated_drop_mem);
uint64_t last_step_mem = 0;
if (num_ciphertext_bits > 0) {
sum_ciphertexts_vec_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, params, num_radix_blocks, num_ciphertext_bits, true,
allocate_gpu_memory, last_step_mem);
sum_ciphertexts_vec_mem =
new int_sum_ciphertexts_vec_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, num_ciphertext_bits, true,
allocate_gpu_memory, last_step_mem);
}
uint32_t requested_flag = outputFlag::FLAG_NONE;
sc_prop_mem = new int_sc_prop_memory<Torus>(
sc_prop_mem = new int_sc_prop_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, requested_flag, allocate_gpu_memory,
last_step_mem);
if (anticipated_buffer_drop) {
@@ -96,12 +99,12 @@ template <typename Torus> struct int_scalar_mul_buffer {
}
};
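
A plaintext sketch of the shift-and-add split this buffer mirrors: one preshifted copy of the input per set scalar bit (logical_scalar_shift_buffer into all_shifted_buffer), then a multi-operand sum (sum_ciphertexts_vec_mem) and a final clean-up (sc_prop_mem):

#include <cstdint>
#include <vector>

static uint64_t scalar_mul_sketch(uint64_t x, uint64_t scalar) {
  std::vector<uint64_t> rows;
  for (unsigned b = 0; b < 64; ++b)
    if ((scalar >> b) & 1)
      rows.push_back(x << b); // one left logical shift per set bit
  uint64_t acc = 0;
  for (uint64_t row : rows)
    acc += row; // the multi-operand sum
  return acc;
}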
template <typename Torus> struct int_scalar_mul_high_buffer {
template <typename Torus, typename KSTorus> struct int_scalar_mul_high_buffer {
int_radix_params params;
bool allocate_gpu_memory;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_mem;
int_scalar_mul_buffer<Torus> *scalar_mul_mem;
int_logical_scalar_shift_buffer<Torus, KSTorus> *logical_scalar_shift_mem;
int_scalar_mul_buffer<Torus, KSTorus> *scalar_mul_mem;
CudaRadixCiphertextFFI *tmp;
@@ -114,11 +117,12 @@ template <typename Torus> struct int_scalar_mul_high_buffer {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->logical_scalar_shift_mem = new int_logical_scalar_shift_buffer<Torus>(
streams, RIGHT_SHIFT, params, 2 * num_radix_blocks, allocate_gpu_memory,
size_tracker);
this->logical_scalar_shift_mem =
new int_logical_scalar_shift_buffer<Torus, KSTorus>(
streams, RIGHT_SHIFT, params, 2 * num_radix_blocks,
allocate_gpu_memory, size_tracker);
this->scalar_mul_mem = new int_scalar_mul_buffer<Torus>(
this->scalar_mul_mem = new int_scalar_mul_buffer<Torus, KSTorus>(
streams, params, 2 * num_radix_blocks, num_scalar_bits,
allocate_gpu_memory, true, size_tracker);
@@ -144,13 +148,14 @@ template <typename Torus> struct int_scalar_mul_high_buffer {
}
};
template <typename Torus> struct int_signed_scalar_mul_high_buffer {
template <typename Torus, typename KSTorus>
struct int_signed_scalar_mul_high_buffer {
int_radix_params params;
bool allocate_gpu_memory;
int_logical_scalar_shift_buffer<Torus> *logical_scalar_shift_mem;
int_scalar_mul_buffer<Torus> *scalar_mul_mem;
int_extend_radix_with_sign_msb_buffer<Torus> *extend_radix_mem;
int_logical_scalar_shift_buffer<Torus, KSTorus> *logical_scalar_shift_mem;
int_scalar_mul_buffer<Torus, KSTorus> *scalar_mul_mem;
int_extend_radix_with_sign_msb_buffer<Torus, KSTorus> *extend_radix_mem;
CudaRadixCiphertextFFI *tmp;
@@ -164,11 +169,12 @@ template <typename Torus> struct int_signed_scalar_mul_high_buffer {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->logical_scalar_shift_mem = new int_logical_scalar_shift_buffer<Torus>(
streams, RIGHT_SHIFT, params, 2 * num_radix_blocks, allocate_gpu_memory,
size_tracker);
this->logical_scalar_shift_mem =
new int_logical_scalar_shift_buffer<Torus, KSTorus>(
streams, RIGHT_SHIFT, params, 2 * num_radix_blocks,
allocate_gpu_memory, size_tracker);
this->scalar_mul_mem = new int_scalar_mul_buffer<Torus>(
this->scalar_mul_mem = new int_scalar_mul_buffer<Torus, KSTorus>(
streams, params, 2 * num_radix_blocks, num_scalar_bits,
allocate_gpu_memory, true, size_tracker);
@@ -177,9 +183,10 @@ template <typename Torus> struct int_signed_scalar_mul_high_buffer {
streams.stream(0), streams.gpu_index(0), tmp, 2 * num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
this->extend_radix_mem = new int_extend_radix_with_sign_msb_buffer<Torus>(
streams, params, num_radix_blocks, num_radix_blocks,
allocate_gpu_memory, size_tracker);
this->extend_radix_mem =
new int_extend_radix_with_sign_msb_buffer<Torus, KSTorus>(
streams, params, num_radix_blocks, num_radix_blocks,
allocate_gpu_memory, size_tracker);
}
void release(CudaStreams streams) {

@@ -1,9 +1,10 @@
#pragma once
#include "integer_utilities.h"
template <typename Torus> struct int_logical_scalar_shift_buffer {
template <typename Torus, typename KSTorus>
struct int_logical_scalar_shift_buffer {
int_radix_params params;
std::vector<int_radix_lut<Torus> *> lut_buffers_bivariate;
std::vector<int_radix_lut<Torus, KSTorus> *> lut_buffers_bivariate;
SHIFT_OR_ROTATE_TYPE shift_type;
@@ -46,9 +47,9 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
// so that in case an application calls the scratch function only once for
// a whole circuit, it can reuse memory for different shift values
for (int s_w_b = 1; s_w_b < num_bits_in_block; s_w_b++) {
auto cur_lut_bivariate =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
auto cur_lut_bivariate = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_radix_blocks, allocate_gpu_memory,
size_tracker);
uint32_t shift_within_block = s_w_b;
@@ -132,9 +133,9 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
// so that in case an application calls the scratch function only once for
// a whole circuit, it can reuse memory for different shift values
for (int s_w_b = 1; s_w_b < num_bits_in_block; s_w_b++) {
auto cur_lut_bivariate =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
auto cur_lut_bivariate = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_radix_blocks, allocate_gpu_memory,
size_tracker);
uint32_t shift_within_block = s_w_b;
@@ -199,10 +200,11 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
}
};
template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
template <typename Torus, typename KSTorus>
struct int_arithmetic_scalar_shift_buffer {
int_radix_params params;
std::vector<int_radix_lut<Torus> *> lut_buffers_univariate;
std::vector<int_radix_lut<Torus> *> lut_buffers_bivariate;
std::vector<int_radix_lut<Torus, KSTorus> *> lut_buffers_univariate;
std::vector<int_radix_lut<Torus, KSTorus> *> lut_buffers_bivariate;
SHIFT_OR_ROTATE_TYPE shift_type;
@@ -246,7 +248,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
// circuit it can reuse memory for different shift values
// With two bits of message this is actually only one LUT.
for (int s_w_b = 1; s_w_b < num_bits_in_block; s_w_b++) {
auto shift_last_block_lut_univariate = new int_radix_lut<Torus>(
auto shift_last_block_lut_univariate = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
uint32_t shift_within_block = s_w_b;
@@ -282,7 +284,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
}
auto padding_block_lut_univariate = new int_radix_lut<Torus>(
auto padding_block_lut_univariate = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
// lut to compute the padding block
@@ -313,9 +315,9 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
// circuit it can reuse memory for different shift values
// NB: with two bits of message, this is actually only one LUT.
for (int s_w_b = 1; s_w_b < num_bits_in_block; s_w_b++) {
auto shift_blocks_lut_bivariate =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
auto shift_blocks_lut_bivariate = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_radix_blocks, allocate_gpu_memory,
size_tracker);
uint32_t shift_within_block = s_w_b;
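
A plaintext sketch of what padding_block_lut_univariate contributes (bit positions illustrative): an arithmetic right shift refills the vacated high blocks with all-ones or all-zeros, depending on the sign bit of the top block.

#include <cstdint>

static uint64_t padding_block_sketch(uint64_t top_block,
                                     uint32_t num_bits_in_block,
                                     uint64_t message_modulus) {
  uint64_t sign = (top_block >> (num_bits_in_block - 1)) & 1;
  return sign ? (message_modulus - 1) : 0; // replicated into shifted-in blocks
}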

@@ -1,7 +1,7 @@
#pragma once
#include "integer_utilities.h"
template <typename Torus> struct int_shift_and_rotate_buffer {
template <typename Torus, typename KSTorus> struct int_shift_and_rotate_buffer {
int_radix_params params;
SHIFT_OR_ROTATE_TYPE shift_type;
bool is_signed;
@@ -13,10 +13,10 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
CudaRadixCiphertextFFI *tmp_input_bits_b;
CudaRadixCiphertextFFI *tmp_mux_inputs;
int_bit_extract_luts_buffer<Torus> *bit_extract_luts;
int_bit_extract_luts_buffer<Torus> *bit_extract_luts_with_offset_2;
int_radix_lut<Torus> *mux_lut;
int_radix_lut<Torus> *cleaning_lut;
int_bit_extract_luts_buffer<Torus, KSTorus> *bit_extract_luts;
int_bit_extract_luts_buffer<Torus, KSTorus> *bit_extract_luts_with_offset_2;
int_radix_lut<Torus, KSTorus> *mux_lut;
int_radix_lut<Torus, KSTorus> *cleaning_lut;
Torus offset;
bool gpu_memory_allocated;
@@ -46,19 +46,20 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
offset = (shift_type == LEFT_SHIFT ? 0 : total_nb_bits);
bit_extract_luts = new int_bit_extract_luts_buffer<Torus>(
bit_extract_luts = new int_bit_extract_luts_buffer<Torus, KSTorus>(
streams, params, bits_per_block, num_radix_blocks, allocate_gpu_memory,
size_tracker);
bit_extract_luts_with_offset_2 = new int_bit_extract_luts_buffer<Torus>(
streams, params, bits_per_block, 2, num_radix_blocks,
allocate_gpu_memory, size_tracker);
bit_extract_luts_with_offset_2 =
new int_bit_extract_luts_buffer<Torus, KSTorus>(
streams, params, bits_per_block, 2, num_radix_blocks,
allocate_gpu_memory, size_tracker);
mux_lut = new int_radix_lut<Torus>(streams, params, 1,
bits_per_block * num_radix_blocks,
allocate_gpu_memory, size_tracker);
mux_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, bits_per_block * num_radix_blocks,
allocate_gpu_memory, size_tracker);
cleaning_lut =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
new int_radix_lut<Torus, KSTorus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
tmp_bits = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(

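A plaintext sketch of the mux-based barrel shifter this buffer supports: bit_extract_luts pulls out each bit of the encrypted shift amount, and for bit s the mux_lut selects between the current value and the value shifted by 2^s; cleaning_lut renormalizes afterwards.

#include <cstdint>

static uint64_t barrel_shift_left(uint64_t x, uint64_t amount,
                                  unsigned total_nb_bits) {
  for (unsigned s = 0; (1u << s) < total_nb_bits; ++s) {
    uint64_t bit = (amount >> s) & 1; // one extracted shift bit
    x = bit ? (x << (1u << s)) : x;   // the MUX the LUT evaluates
  }
  return x;
}
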
@@ -1,13 +1,13 @@
#pragma once
#include "integer_utilities.h"
template <typename Torus> struct int_overflowing_sub_memory {
template <typename Torus, typename KSTorus> struct int_overflowing_sub_memory {
Torus *generates_or_propagates;
Torus *step_output;
int_radix_lut<Torus> *luts_array;
int_radix_lut<Torus> *luts_borrow_propagation_sum;
int_radix_lut<Torus> *message_acc;
int_radix_lut<Torus, KSTorus> *luts_array;
int_radix_lut<Torus, KSTorus> *luts_borrow_propagation_sum;
int_radix_lut<Torus, KSTorus> *message_acc;
int_radix_params params;
bool gpu_memory_allocated;
@@ -65,14 +65,15 @@ template <typename Torus> struct int_overflowing_sub_memory {
};
// create lut objects
luts_array = new int_radix_lut<Torus>(streams, params, 2, num_radix_blocks,
luts_array =
new int_radix_lut<Torus, KSTorus>(streams, params, 2, num_radix_blocks,
allocate_gpu_memory, size_tracker);
luts_borrow_propagation_sum = new int_radix_lut<Torus>(
luts_borrow_propagation_sum = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_radix_blocks, luts_array, size_tracker,
allocate_gpu_memory, size_tracker);
message_acc = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_radix_blocks, luts_array, size_tracker,
allocate_gpu_memory, size_tracker);
message_acc = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
luts_array, size_tracker,
allocate_gpu_memory, size_tracker);
auto lut_does_block_generate_carry = luts_array->get_lut(0, 0);
auto lut_does_block_generate_or_propagate = luts_array->get_lut(0, 1);
@@ -132,13 +133,13 @@ template <typename Torus> struct int_overflowing_sub_memory {
}
};
template <typename Torus> struct int_sub_and_propagate {
template <typename Torus, typename KSTorus> struct int_sub_and_propagate {
int_radix_params params;
bool allocate_gpu_memory;
CudaRadixCiphertextFFI *neg_rhs_array;
int_sc_prop_memory<Torus> *sc_prop_mem;
int_sc_prop_memory<Torus, KSTorus> *sc_prop_mem;
int_sub_and_propagate(CudaStreams streams, const int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
@@ -147,7 +148,7 @@ template <typename Torus> struct int_sub_and_propagate {
this->params = params;
this->allocate_gpu_memory = allocate_gpu_memory;
this->sc_prop_mem = new int_sc_prop_memory<Torus>(
this->sc_prop_mem = new int_sc_prop_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, requested_flag_in,
allocate_gpu_memory, size_tracker);

@@ -6,13 +6,14 @@
const uint32_t MAX_STREAMS_FOR_VECTOR_COMPARISON = 8;
template <typename Torus> struct int_unchecked_all_eq_slices_buffer {
template <typename Torus, typename KSTorus>
struct int_unchecked_all_eq_slices_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_inputs;
int_comparison_buffer<Torus> **eq_buffers;
int_comparison_buffer<Torus> *reduction_buffer;
int_comparison_buffer<Torus, KSTorus> **eq_buffers;
int_comparison_buffer<Torus, KSTorus> *reduction_buffer;
CudaRadixCiphertextFFI *packed_results;
@@ -68,16 +69,16 @@ template <typename Torus> struct int_unchecked_all_eq_slices_buffer {
}
}
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
this->eq_buffers = new int_comparison_buffer<Torus, KSTorus> *[num_streams];
for (uint32_t i = 0; i < num_streams; i++) {
this->eq_buffers[i] = new int_comparison_buffer<Torus>(
this->eq_buffers[i] = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
size_tracker);
}
this->reduction_buffer =
new int_comparison_buffer<Torus>(streams, EQ, params, num_inputs, false,
allocate_gpu_memory, size_tracker);
this->reduction_buffer = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_inputs, false, allocate_gpu_memory,
size_tracker);
this->packed_results = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -133,14 +134,15 @@ template <typename Torus> struct int_unchecked_all_eq_slices_buffer {
}
};
template <typename Torus> struct int_unchecked_contains_sub_slice_buffer {
template <typename Torus, typename KSTorus>
struct int_unchecked_contains_sub_slice_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_windows;
int_unchecked_all_eq_slices_buffer<Torus> *all_eq_buffer;
int_unchecked_all_eq_slices_buffer<Torus, KSTorus> *all_eq_buffer;
CudaRadixCiphertextFFI *packed_results;
int_comparison_buffer<Torus> *final_reduction_buffer;
int_comparison_buffer<Torus, KSTorus> *final_reduction_buffer;
int_unchecked_contains_sub_slice_buffer(CudaStreams streams,
int_radix_params params,
@@ -152,9 +154,10 @@ template <typename Torus> struct int_unchecked_contains_sub_slice_buffer {
this->allocate_gpu_memory = allocate_gpu_memory;
this->num_windows = num_lhs - num_rhs + 1;
this->all_eq_buffer = new int_unchecked_all_eq_slices_buffer<Torus>(
streams, params, num_rhs, num_blocks, allocate_gpu_memory,
size_tracker);
this->all_eq_buffer =
new int_unchecked_all_eq_slices_buffer<Torus, KSTorus>(
streams, params, num_rhs, num_blocks, allocate_gpu_memory,
size_tracker);
this->packed_results = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -162,7 +165,7 @@ template <typename Torus> struct int_unchecked_contains_sub_slice_buffer {
this->num_windows, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
this->final_reduction_buffer = new int_comparison_buffer<Torus>(
this->final_reduction_buffer = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, this->num_windows, false, allocate_gpu_memory,
size_tracker);
}
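
A plaintext reference for contains-sub-slice: rhs is compared against each of the num_lhs - num_rhs + 1 windows (one eq buffer per stream above), and the per-window bits are reduced at the end (final_reduction_buffer):

#include <cstdint>
#include <vector>

static bool contains_sub_slice(const std::vector<uint64_t> &lhs,
                               const std::vector<uint64_t> &rhs) {
  if (rhs.size() > lhs.size()) return false;
  for (size_t w = 0; w + rhs.size() <= lhs.size(); ++w) {
    bool all_eq = true;
    for (size_t i = 0; i < rhs.size(); ++i)
      all_eq = all_eq && (lhs[w + i] == rhs[i]); // all-eq over one slice
    if (all_eq) return true; // OR-reduction over the packed window results
  }
  return false;
}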

@@ -9,13 +9,14 @@
const uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 10;
template <typename Torus> struct int_equality_selectors_buffer {
template <typename Torus, typename KSTorus>
struct int_equality_selectors_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t lut_stride;
uint32_t num_possible_values;
int_radix_lut<Torus> *comparison_luts;
int_radix_lut<Torus, KSTorus> *comparison_luts;
CudaRadixCiphertextFFI *tmp_many_luts_output;
CudaStreams active_streams;
@@ -23,7 +24,7 @@ template <typename Torus> struct int_equality_selectors_buffer {
uint32_t num_streams;
CudaRadixCiphertextFFI **tmp_block_comparisons;
int_comparison_buffer<Torus> **reduction_buffers;
int_comparison_buffer<Torus, KSTorus> **reduction_buffers;
int_equality_selectors_buffer(CudaStreams streams, int_radix_params params,
uint32_t num_possible_values,
@@ -49,7 +50,7 @@ template <typename Torus> struct int_equality_selectors_buffer {
uint32_t box_size = params.polynomial_size / ciphertext_modulus;
lut_stride = (ciphertext_modulus / params.message_modulus) * box_size;
this->comparison_luts = new int_radix_lut<Torus>(
this->comparison_luts = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_blocks, params.message_modulus,
allocate_gpu_memory, size_tracker);
@@ -80,7 +81,7 @@ template <typename Torus> struct int_equality_selectors_buffer {
this->tmp_block_comparisons =
new CudaRadixCiphertextFFI *[this->num_streams];
this->reduction_buffers =
new int_comparison_buffer<Torus> *[this->num_streams];
new int_comparison_buffer<Torus, KSTorus> *[this->num_streams];
for (uint32_t j = 0; j < this->num_streams; j++) {
this->tmp_block_comparisons[j] = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -88,7 +89,7 @@ template <typename Torus> struct int_equality_selectors_buffer {
this->tmp_block_comparisons[j], num_blocks, params.big_lwe_dimension,
size_tracker, allocate_gpu_memory);
this->reduction_buffers[j] = new int_comparison_buffer<Torus>(
this->reduction_buffers[j] = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
size_tracker);
}
@@ -123,7 +124,7 @@ template <typename Torus> struct int_equality_selectors_buffer {
}
};
template <typename Torus> struct int_possible_results_buffer {
template <typename Torus, typename KSTorus> struct int_possible_results_buffer {
int_radix_params params;
bool allocate_gpu_memory;
@@ -132,7 +133,7 @@ template <typename Torus> struct int_possible_results_buffer {
uint32_t num_lut_accumulators;
uint32_t lut_stride;
int_radix_lut<Torus> **stream_luts;
int_radix_lut<Torus, KSTorus> **stream_luts;
CudaStreams active_streams;
InternalCudaStreams internal_cuda_streams;
@@ -177,7 +178,7 @@ template <typename Torus> struct int_possible_results_buffer {
(total_luts_needed + max_luts_per_call - 1) / max_luts_per_call;
stream_luts =
new int_radix_lut<Torus> *[num_streams * num_lut_accumulators];
new int_radix_lut<Torus, KSTorus> *[num_streams * num_lut_accumulators];
std::vector<std::function<Torus(Torus)>> fns;
fns.reserve(max_luts_per_call);
@@ -191,9 +192,10 @@ template <typename Torus> struct int_possible_results_buffer {
uint32_t luts_in_this_call =
std::min(max_luts_per_call, total_luts_needed - lut_value_start);
int_radix_lut<Torus> *current_lut =
new int_radix_lut<Torus>(streams, params, 1, 1, luts_in_this_call,
allocate_gpu_memory, size_tracker);
int_radix_lut<Torus, KSTorus> *current_lut =
new int_radix_lut<Torus, KSTorus>(
streams, params, 1, 1, luts_in_this_call, allocate_gpu_memory,
size_tracker);
for (uint32_t j = 0; j < luts_in_this_call; j++) {
uint32_t c = lut_value_start + j;
@@ -246,14 +248,15 @@ template <typename Torus> struct int_possible_results_buffer {
}
};
template <typename Torus> struct int_aggregate_one_hot_buffer {
template <typename Torus, typename KSTorus>
struct int_aggregate_one_hot_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t chunk_size;
int_radix_lut<Torus> **stream_identity_luts;
int_radix_lut<Torus> *message_extract_lut;
int_radix_lut<Torus> *carry_extract_lut;
int_radix_lut<Torus, KSTorus> **stream_identity_luts;
int_radix_lut<Torus, KSTorus> *message_extract_lut;
int_radix_lut<Torus, KSTorus> *carry_extract_lut;
CudaStreams active_streams;
InternalCudaStreams internal_cuda_streams;
@@ -287,11 +290,12 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams);
this->stream_identity_luts = new int_radix_lut<Torus> *[num_streams];
this->stream_identity_luts =
new int_radix_lut<Torus, KSTorus> *[num_streams];
std::function<Torus(Torus)> id_fn = [](Torus x) -> Torus { return x; };
for (uint32_t i = 0; i < num_streams; i++) {
int_radix_lut<Torus> *lut = new int_radix_lut<Torus>(
int_radix_lut<Torus, KSTorus> *lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
@@ -311,7 +315,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
return x / params.message_modulus;
};
this->message_extract_lut = new int_radix_lut<Torus>(
this->message_extract_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
@@ -323,7 +327,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
this->message_extract_lut->broadcast_lut(
streams.active_gpu_subset(num_blocks));
this->carry_extract_lut = new int_radix_lut<Torus>(
this->carry_extract_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
@@ -404,7 +408,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
}
};
template <typename Torus> struct int_unchecked_match_buffer {
template <typename Torus, typename KSTorus> struct int_unchecked_match_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_matches;
@@ -412,10 +416,10 @@ template <typename Torus> struct int_unchecked_match_buffer {
uint32_t num_output_packed_blocks;
bool max_output_is_zero;
int_equality_selectors_buffer<Torus> *eq_selectors_buffer;
int_possible_results_buffer<Torus> *possible_results_buffer;
int_aggregate_one_hot_buffer<Torus> *aggregate_buffer;
int_comparison_buffer<Torus> *at_least_one_true_buffer;
int_equality_selectors_buffer<Torus, KSTorus> *eq_selectors_buffer;
int_possible_results_buffer<Torus, KSTorus> *possible_results_buffer;
int_aggregate_one_hot_buffer<Torus, KSTorus> *aggregate_buffer;
int_comparison_buffer<Torus, KSTorus> *at_least_one_true_buffer;
CudaRadixCiphertextFFI *selectors_list;
CudaRadixCiphertextFFI *packed_selectors_ct;
@@ -433,21 +437,23 @@ template <typename Torus> struct int_unchecked_match_buffer {
this->num_output_packed_blocks = num_output_packed_blocks;
this->max_output_is_zero = max_output_is_zero;
this->eq_selectors_buffer = new int_equality_selectors_buffer<Torus>(
streams, params, num_matches, num_input_blocks, allocate_gpu_memory,
size_tracker);
this->eq_selectors_buffer =
new int_equality_selectors_buffer<Torus, KSTorus>(
streams, params, num_matches, num_input_blocks, allocate_gpu_memory,
size_tracker);
this->possible_results_buffer = new int_possible_results_buffer<Torus>(
streams, params, num_output_packed_blocks, num_matches,
allocate_gpu_memory, size_tracker);
this->possible_results_buffer =
new int_possible_results_buffer<Torus, KSTorus>(
streams, params, num_output_packed_blocks, num_matches,
allocate_gpu_memory, size_tracker);
if (!max_output_is_zero) {
this->aggregate_buffer = new int_aggregate_one_hot_buffer<Torus>(
this->aggregate_buffer = new int_aggregate_one_hot_buffer<Torus, KSTorus>(
streams, params, num_output_packed_blocks, num_matches,
allocate_gpu_memory, size_tracker);
}
this->at_least_one_true_buffer = new int_comparison_buffer<Torus>(
this->at_least_one_true_buffer = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_matches, false, allocate_gpu_memory,
size_tracker);
@@ -511,7 +517,8 @@ template <typename Torus> struct int_unchecked_match_buffer {
}
};
template <typename Torus> struct int_unchecked_match_value_or_buffer {
template <typename Torus, typename KSTorus>
struct int_unchecked_match_value_or_buffer {
int_radix_params params;
bool allocate_gpu_memory;
@@ -521,8 +528,8 @@ template <typename Torus> struct int_unchecked_match_value_or_buffer {
uint32_t num_final_blocks;
bool max_output_is_zero;
int_unchecked_match_buffer<Torus> *match_buffer;
int_cmux_buffer<Torus> *cmux_buffer;
int_unchecked_match_buffer<Torus, KSTorus> *match_buffer;
int_cmux_buffer<Torus, KSTorus> *cmux_buffer;
CudaRadixCiphertextFFI *tmp_match_result;
CudaRadixCiphertextFFI *tmp_match_bool;
@@ -543,11 +550,11 @@ template <typename Torus> struct int_unchecked_match_value_or_buffer {
this->num_final_blocks = num_final_blocks;
this->max_output_is_zero = max_output_is_zero;
this->match_buffer = new int_unchecked_match_buffer<Torus>(
this->match_buffer = new int_unchecked_match_buffer<Torus, KSTorus>(
streams, params, num_matches, num_input_blocks, num_match_packed_blocks,
max_output_is_zero, allocate_gpu_memory, size_tracker);
this->cmux_buffer = new int_cmux_buffer<Torus>(
this->cmux_buffer = new int_cmux_buffer<Torus, KSTorus>(
streams, [](Torus x) -> Torus { return x == 1; }, params,
num_final_blocks, allocate_gpu_memory, size_tracker);
@@ -600,13 +607,14 @@ template <typename Torus> struct int_unchecked_match_value_or_buffer {
}
};
template <typename Torus> struct int_unchecked_contains_buffer {
template <typename Torus, typename KSTorus>
struct int_unchecked_contains_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_inputs;
int_comparison_buffer<Torus> **eq_buffers;
int_comparison_buffer<Torus> *reduction_buffer;
int_comparison_buffer<Torus, KSTorus> **eq_buffers;
int_comparison_buffer<Torus, KSTorus> *reduction_buffer;
CudaRadixCiphertextFFI *packed_selectors;
@@ -633,16 +641,16 @@ template <typename Torus> struct int_unchecked_contains_buffer {
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
this->eq_buffers = new int_comparison_buffer<Torus, KSTorus> *[num_streams];
for (uint32_t i = 0; i < num_streams; i++) {
this->eq_buffers[i] = new int_comparison_buffer<Torus>(
this->eq_buffers[i] = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
size_tracker);
}
this->reduction_buffer =
new int_comparison_buffer<Torus>(streams, EQ, params, num_inputs, false,
allocate_gpu_memory, size_tracker);
this->reduction_buffer = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_inputs, false, allocate_gpu_memory,
size_tracker);
this->packed_selectors = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -672,13 +680,14 @@ template <typename Torus> struct int_unchecked_contains_buffer {
}
};
template <typename Torus> struct int_unchecked_contains_clear_buffer {
template <typename Torus, typename KSTorus>
struct int_unchecked_contains_clear_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_inputs;
int_comparison_buffer<Torus> **eq_buffers;
int_comparison_buffer<Torus> *reduction_buffer;
int_comparison_buffer<Torus, KSTorus> **eq_buffers;
int_comparison_buffer<Torus, KSTorus> *reduction_buffer;
CudaRadixCiphertextFFI *packed_selectors;
CudaRadixCiphertextFFI *tmp_clear_val;
@@ -708,16 +717,16 @@ template <typename Torus> struct int_unchecked_contains_clear_buffer {
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
this->eq_buffers = new int_comparison_buffer<Torus, KSTorus> *[num_streams];
for (uint32_t i = 0; i < num_streams; i++) {
this->eq_buffers[i] = new int_comparison_buffer<Torus>(
this->eq_buffers[i] = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
size_tracker);
}
this->reduction_buffer =
new int_comparison_buffer<Torus>(streams, EQ, params, num_inputs, false,
allocate_gpu_memory, size_tracker);
this->reduction_buffer = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_inputs, false, allocate_gpu_memory,
size_tracker);
this->packed_selectors = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -764,13 +773,14 @@ template <typename Torus> struct int_unchecked_contains_clear_buffer {
}
};
template <typename Torus> struct int_unchecked_is_in_clears_buffer {
template <typename Torus, typename KSTorus>
struct int_unchecked_is_in_clears_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_clears;
int_equality_selectors_buffer<Torus> *eq_buffer;
int_comparison_buffer<Torus> *reduction_buffer;
int_equality_selectors_buffer<Torus, KSTorus> *eq_buffer;
int_comparison_buffer<Torus, KSTorus> *reduction_buffer;
CudaRadixCiphertextFFI *packed_selectors;
CudaRadixCiphertextFFI *unpacked_selectors;
@@ -784,13 +794,13 @@ template <typename Torus> struct int_unchecked_is_in_clears_buffer {
this->allocate_gpu_memory = allocate_gpu_memory;
this->num_clears = num_clears;
this->eq_buffer = new int_equality_selectors_buffer<Torus>(
this->eq_buffer = new int_equality_selectors_buffer<Torus, KSTorus>(
streams, params, num_clears, num_blocks, allocate_gpu_memory,
size_tracker);
this->reduction_buffer =
new int_comparison_buffer<Torus>(streams, EQ, params, num_clears, false,
allocate_gpu_memory, size_tracker);
this->reduction_buffer = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_clears, false, allocate_gpu_memory,
size_tracker);
this->packed_selectors = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
@@ -824,14 +834,15 @@ template <typename Torus> struct int_unchecked_is_in_clears_buffer {
}
};
template <typename Torus> struct int_final_index_from_selectors_buffer {
template <typename Torus, typename KSTorus>
struct int_final_index_from_selectors_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_inputs;
int_possible_results_buffer<Torus> *possible_results_buf;
int_aggregate_one_hot_buffer<Torus> *aggregate_buf;
int_comparison_buffer<Torus> *reduction_buf;
int_possible_results_buffer<Torus, KSTorus> *possible_results_buf;
int_aggregate_one_hot_buffer<Torus, KSTorus> *aggregate_buf;
int_comparison_buffer<Torus, KSTorus> *reduction_buf;
CudaRadixCiphertextFFI *packed_selectors;
CudaRadixCiphertextFFI *unpacked_selectors;
@@ -851,18 +862,19 @@ template <typename Torus> struct int_final_index_from_selectors_buffer {
uint32_t packed_len = (num_blocks_index + 1) / 2;
this->possible_results_buf = new int_possible_results_buffer<Torus>(
streams, params, packed_len, num_inputs, allocate_gpu_memory,
size_tracker);
this->aggregate_buf = new int_aggregate_one_hot_buffer<Torus>(
streams, params, packed_len, num_inputs, allocate_gpu_memory,
size_tracker);
this->reduction_buf =
new int_comparison_buffer<Torus>(streams, EQ, params, num_inputs, false,
allocate_gpu_memory, size_tracker);
this->possible_results_buf =
new int_possible_results_buffer<Torus, KSTorus>(
streams, params, packed_len, num_inputs, allocate_gpu_memory,
size_tracker);
this->aggregate_buf = new int_aggregate_one_hot_buffer<Torus, KSTorus>(
streams, params, packed_len, num_inputs, allocate_gpu_memory,
size_tracker);
this->reduction_buf = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_inputs, false, allocate_gpu_memory,
size_tracker);
this->packed_selectors = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_selectors,
@@ -927,13 +939,14 @@ template <typename Torus> struct int_final_index_from_selectors_buffer {
}
};
template <typename Torus> struct int_unchecked_index_in_clears_buffer {
template <typename Torus, typename KSTorus>
struct int_unchecked_index_in_clears_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_clears;
int_equality_selectors_buffer<Torus> *eq_selectors_buf;
int_final_index_from_selectors_buffer<Torus> *final_index_buf;
int_equality_selectors_buffer<Torus, KSTorus> *eq_selectors_buf;
int_final_index_from_selectors_buffer<Torus, KSTorus> *final_index_buf;
int_unchecked_index_in_clears_buffer(CudaStreams streams,
int_radix_params params,
@@ -945,13 +958,14 @@ template <typename Torus> struct int_unchecked_index_in_clears_buffer {
this->allocate_gpu_memory = allocate_gpu_memory;
this->num_clears = num_clears;
this->eq_selectors_buf = new int_equality_selectors_buffer<Torus>(
this->eq_selectors_buf = new int_equality_selectors_buffer<Torus, KSTorus>(
streams, params, num_clears, num_blocks, allocate_gpu_memory,
size_tracker);
this->final_index_buf = new int_final_index_from_selectors_buffer<Torus>(
streams, params, num_clears, num_blocks_index, allocate_gpu_memory,
size_tracker);
this->final_index_buf =
new int_final_index_from_selectors_buffer<Torus, KSTorus>(
streams, params, num_clears, num_blocks_index, allocate_gpu_memory,
size_tracker);
}
void release(CudaStreams streams) {
@@ -965,15 +979,16 @@ template <typename Torus> struct int_unchecked_index_in_clears_buffer {
}
};
template <typename Torus> struct int_unchecked_first_index_in_clears_buffer {
template <typename Torus, typename KSTorus>
struct int_unchecked_first_index_in_clears_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_unique;
int_equality_selectors_buffer<Torus> *eq_selectors_buf;
int_possible_results_buffer<Torus> *possible_results_buf;
int_aggregate_one_hot_buffer<Torus> *aggregate_buf;
int_comparison_buffer<Torus> *reduction_buf;
int_equality_selectors_buffer<Torus, KSTorus> *eq_selectors_buf;
int_possible_results_buffer<Torus, KSTorus> *possible_results_buf;
int_aggregate_one_hot_buffer<Torus, KSTorus> *aggregate_buf;
int_comparison_buffer<Torus, KSTorus> *reduction_buf;
CudaRadixCiphertextFFI *packed_selectors;
CudaRadixCiphertextFFI *unpacked_selectors;
@@ -987,23 +1002,24 @@ template <typename Torus> struct int_unchecked_first_index_in_clears_buffer {
this->allocate_gpu_memory = allocate_gpu_memory;
this->num_unique = num_unique;
this->eq_selectors_buf = new int_equality_selectors_buffer<Torus>(
this->eq_selectors_buf = new int_equality_selectors_buffer<Torus, KSTorus>(
streams, params, num_unique, num_blocks, allocate_gpu_memory,
size_tracker);
uint32_t packed_len = (num_blocks_index + 1) / 2;
this->possible_results_buf = new int_possible_results_buffer<Torus>(
streams, params, packed_len, num_unique, allocate_gpu_memory,
size_tracker);
this->aggregate_buf = new int_aggregate_one_hot_buffer<Torus>(
streams, params, packed_len, num_unique, allocate_gpu_memory,
size_tracker);
this->reduction_buf =
new int_comparison_buffer<Torus>(streams, EQ, params, num_unique, false,
allocate_gpu_memory, size_tracker);
this->possible_results_buf =
new int_possible_results_buffer<Torus, KSTorus>(
streams, params, packed_len, num_unique, allocate_gpu_memory,
size_tracker);
this->aggregate_buf = new int_aggregate_one_hot_buffer<Torus, KSTorus>(
streams, params, packed_len, num_unique, allocate_gpu_memory,
size_tracker);
this->reduction_buf = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_unique, false, allocate_gpu_memory,
size_tracker);
this->packed_selectors = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_selectors,
@@ -1056,15 +1072,16 @@ template <typename Torus> struct int_unchecked_first_index_in_clears_buffer {
}
};
template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
template <typename Torus, typename KSTorus>
struct int_unchecked_first_index_of_clear_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_inputs;
int_comparison_buffer<Torus> **eq_buffers;
int_possible_results_buffer<Torus> *possible_results_buf;
int_aggregate_one_hot_buffer<Torus> *aggregate_buf;
int_comparison_buffer<Torus> *reduction_buf;
int_comparison_buffer<Torus, KSTorus> **eq_buffers;
int_possible_results_buffer<Torus, KSTorus> *possible_results_buf;
int_aggregate_one_hot_buffer<Torus, KSTorus> *aggregate_buf;
int_comparison_buffer<Torus, KSTorus> *reduction_buf;
CudaRadixCiphertextFFI *packed_selectors;
CudaRadixCiphertextFFI *unpacked_selectors;
@@ -1073,8 +1090,8 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
Torus *d_clear_val;
uint64_t *h_indices;
int_radix_lut<Torus> *prefix_sum_lut;
int_radix_lut<Torus> *cleanup_lut;
int_radix_lut<Torus, KSTorus> *prefix_sum_lut;
int_radix_lut<Torus, KSTorus> *cleanup_lut;
CudaStreams active_streams;
InternalCudaStreams internal_cuda_streams;
@@ -1101,25 +1118,26 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
uint32_t packed_len = (num_blocks_index + 1) / 2;
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
this->eq_buffers = new int_comparison_buffer<Torus, KSTorus> *[num_streams];
for (uint32_t i = 0; i < num_streams; i++) {
this->eq_buffers[i] = new int_comparison_buffer<Torus>(
this->eq_buffers[i] = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
size_tracker);
}
this->possible_results_buf = new int_possible_results_buffer<Torus>(
streams, params, packed_len, num_inputs, allocate_gpu_memory,
size_tracker);
this->aggregate_buf = new int_aggregate_one_hot_buffer<Torus>(
streams, params, packed_len, num_inputs, allocate_gpu_memory,
size_tracker);
this->reduction_buf =
new int_comparison_buffer<Torus>(streams, EQ, params, num_inputs, false,
allocate_gpu_memory, size_tracker);
this->possible_results_buf =
new int_possible_results_buffer<Torus, KSTorus>(
streams, params, packed_len, num_inputs, allocate_gpu_memory,
size_tracker);
this->aggregate_buf = new int_aggregate_one_hot_buffer<Torus, KSTorus>(
streams, params, packed_len, num_inputs, allocate_gpu_memory,
size_tracker);
this->reduction_buf = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_inputs, false, allocate_gpu_memory,
size_tracker);
this->packed_selectors = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_selectors,
@@ -1174,7 +1192,7 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
}
return current;
};
this->prefix_sum_lut = new int_radix_lut<Torus>(
this->prefix_sum_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
@@ -1192,7 +1210,7 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
return 0;
return val;
};
this->cleanup_lut = new int_radix_lut<Torus>(
this->cleanup_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
@@ -1254,23 +1272,24 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
}
};
template <typename Torus> struct int_unchecked_first_index_of_buffer {
template <typename Torus, typename KSTorus>
struct int_unchecked_first_index_of_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_inputs;
int_comparison_buffer<Torus> **eq_buffers;
int_possible_results_buffer<Torus> *possible_results_buf;
int_aggregate_one_hot_buffer<Torus> *aggregate_buf;
int_comparison_buffer<Torus> *reduction_buf;
int_comparison_buffer<Torus, KSTorus> **eq_buffers;
int_possible_results_buffer<Torus, KSTorus> *possible_results_buf;
int_aggregate_one_hot_buffer<Torus, KSTorus> *aggregate_buf;
int_comparison_buffer<Torus, KSTorus> *reduction_buf;
CudaRadixCiphertextFFI *packed_selectors;
CudaRadixCiphertextFFI *unpacked_selectors;
CudaRadixCiphertextFFI *possible_results_ct_list;
uint64_t *h_indices;
int_radix_lut<Torus> *prefix_sum_lut;
int_radix_lut<Torus> *cleanup_lut;
int_radix_lut<Torus, KSTorus> *prefix_sum_lut;
int_radix_lut<Torus, KSTorus> *cleanup_lut;
CudaStreams active_streams;
InternalCudaStreams internal_cuda_streams;
@@ -1299,25 +1318,26 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
uint32_t packed_len = (num_blocks_index + 1) / 2;
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
this->eq_buffers = new int_comparison_buffer<Torus, KSTorus> *[num_streams];
for (uint32_t i = 0; i < num_streams; i++) {
this->eq_buffers[i] = new int_comparison_buffer<Torus>(
this->eq_buffers[i] = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
size_tracker);
}
this->possible_results_buf = new int_possible_results_buffer<Torus>(
streams, params, packed_len, num_inputs, allocate_gpu_memory,
size_tracker);
this->aggregate_buf = new int_aggregate_one_hot_buffer<Torus>(
streams, params, packed_len, num_inputs, allocate_gpu_memory,
size_tracker);
this->reduction_buf =
new int_comparison_buffer<Torus>(streams, EQ, params, num_inputs, false,
allocate_gpu_memory, size_tracker);
this->possible_results_buf =
new int_possible_results_buffer<Torus, KSTorus>(
streams, params, packed_len, num_inputs, allocate_gpu_memory,
size_tracker);
this->aggregate_buf = new int_aggregate_one_hot_buffer<Torus, KSTorus>(
streams, params, packed_len, num_inputs, allocate_gpu_memory,
size_tracker);
this->reduction_buf = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_inputs, false, allocate_gpu_memory,
size_tracker);
this->packed_selectors = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), this->packed_selectors,
@@ -1362,7 +1382,7 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
}
return current;
};
this->prefix_sum_lut = new int_radix_lut<Torus>(
this->prefix_sum_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
@@ -1380,7 +1400,7 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
return 0;
return val;
};
this->cleanup_lut = new int_radix_lut<Torus>(
this->cleanup_lut = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
@@ -1435,13 +1455,14 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
}
};
template <typename Torus> struct int_unchecked_index_of_buffer {
template <typename Torus, typename KSTorus>
struct int_unchecked_index_of_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_inputs;
int_comparison_buffer<Torus> **eq_buffers;
int_final_index_from_selectors_buffer<Torus> *final_index_buf;
int_comparison_buffer<Torus, KSTorus> **eq_buffers;
int_final_index_from_selectors_buffer<Torus, KSTorus> *final_index_buf;
CudaStreams active_streams;
InternalCudaStreams internal_cuda_streams;
@@ -1467,16 +1488,17 @@ template <typename Torus> struct int_unchecked_index_of_buffer {
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
this->eq_buffers = new int_comparison_buffer<Torus, KSTorus> *[num_streams];
for (uint32_t i = 0; i < num_streams; i++) {
this->eq_buffers[i] = new int_comparison_buffer<Torus>(
this->eq_buffers[i] = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
size_tracker);
}
this->final_index_buf = new int_final_index_from_selectors_buffer<Torus>(
streams, params, num_inputs, num_blocks_index, allocate_gpu_memory,
size_tracker);
this->final_index_buf =
new int_final_index_from_selectors_buffer<Torus, KSTorus>(
streams, params, num_inputs, num_blocks_index, allocate_gpu_memory,
size_tracker);
}
void release(CudaStreams streams) {
@@ -1495,13 +1517,14 @@ template <typename Torus> struct int_unchecked_index_of_buffer {
}
};
template <typename Torus> struct int_unchecked_index_of_clear_buffer {
template <typename Torus, typename KSTorus>
struct int_unchecked_index_of_clear_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t num_inputs;
int_comparison_buffer<Torus> **eq_buffers;
int_final_index_from_selectors_buffer<Torus> *final_index_buf;
int_comparison_buffer<Torus, KSTorus> **eq_buffers;
int_final_index_from_selectors_buffer<Torus, KSTorus> *final_index_buf;
CudaStreams active_streams;
InternalCudaStreams internal_cuda_streams;
@@ -1528,16 +1551,17 @@ template <typename Torus> struct int_unchecked_index_of_clear_buffer {
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
this->eq_buffers = new int_comparison_buffer<Torus, KSTorus> *[num_streams];
for (uint32_t i = 0; i < num_streams; i++) {
this->eq_buffers[i] = new int_comparison_buffer<Torus>(
this->eq_buffers[i] = new int_comparison_buffer<Torus, KSTorus>(
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
size_tracker);
}
this->final_index_buf = new int_final_index_from_selectors_buffer<Torus>(
streams, params, num_inputs, num_blocks_index, allocate_gpu_memory,
size_tracker);
this->final_index_buf =
new int_final_index_from_selectors_buffer<Torus, KSTorus>(
streams, params, num_inputs, num_blocks_index, allocate_gpu_memory,
size_tracker);
}
void release(CudaStreams streams) {

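All of the buffer structs above receive the same mechanical change: a second template parameter for the keyswitching torus, so that keyswitch material can use a narrower word (e.g. uint32_t for a 32-bit keyswitch) independently of the uint64_t ciphertext torus. A hypothetical, self-contained sketch of the shape of that refactor (toy types, not the real int_radix_lut):

#include <cstdint>

// Hypothetical stand-ins for the real LUT/buffer types in this header.
template <typename Torus, typename KSTorus> struct toy_radix_lut {
  KSTorus *ksk_workspace = nullptr; // keyswitch data in the narrow type
  Torus *accumulator = nullptr;     // bootstrap accumulator in the wide type
};

template <typename Torus, typename KSTorus> struct toy_buffer {
  toy_radix_lut<Torus, KSTorus> *lut;
  toy_buffer() : lut(new toy_radix_lut<Torus, KSTorus>) {}
  void release() { delete lut; lut = nullptr; }
};

int main() {
  toy_buffer<uint64_t, uint64_t> classic; // previous behaviour: 64-bit keyswitch
  toy_buffer<uint64_t, uint32_t> ks32;    // narrower keyswitch material
  classic.release();
  ks32.release();
  return 0;
}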
View File

@@ -31,7 +31,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
uint32_t num_many_lut, uint32_t lut_stride);
#endif
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,

View File

@@ -469,7 +469,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_programmable_bootstrap_tbc(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,

View File

@@ -95,14 +95,14 @@ template <typename Torus> struct expand_job {
////////////////////////////////////
template <typename Torus> struct zk_expand_mem {
template <typename Torus, typename KSTorus> struct zk_expand_mem {
int_radix_params computing_params;
int_radix_params casting_params;
bool casting_key_type;
uint32_t num_lwes;
uint32_t num_compact_lists;
int_radix_lut<Torus> *message_and_carry_extract_luts;
int_radix_lut<Torus, KSTorus> *message_and_carry_extract_luts;
Torus *tmp_expanded_lwes;
Torus *tmp_ksed_small_to_big_expanded_lwes;
@@ -170,7 +170,7 @@ template <typename Torus> struct zk_expand_mem {
if (casting_key_type == SMALL_TO_BIG) {
params = computing_params;
}
message_and_carry_extract_luts = new int_radix_lut<Torus>(
message_and_carry_extract_luts = new int_radix_lut<Torus, KSTorus>(
streams, params, 4, 2 * num_lwes, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(

View File

@@ -16,9 +16,10 @@ uint64_t scratch_cuda_integer_aes_encrypt_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_integer_aes_encrypt<uint64_t>(
CudaStreams(streams), (int_aes_encrypt_buffer<uint64_t> **)mem_ptr,
params, allocate_gpu_memory, num_aes_inputs, sbox_parallelism);
return scratch_cuda_integer_aes_encrypt<uint64_t, uint64_t>(
CudaStreams(streams),
(int_aes_encrypt_buffer<uint64_t, uint64_t> **)mem_ptr, params,
allocate_gpu_memory, num_aes_inputs, sbox_parallelism);
}
void cuda_integer_aes_ctr_encrypt_64(CudaStreamsFFI streams,
@@ -29,17 +30,17 @@ void cuda_integer_aes_ctr_encrypt_64(CudaStreamsFFI streams,
uint32_t num_aes_inputs, int8_t *mem_ptr,
void *const *bsks, void *const *ksks) {
host_integer_aes_ctr_encrypt<uint64_t>(
host_integer_aes_ctr_encrypt<uint64_t, uint64_t>(
CudaStreams(streams), output, iv, round_keys, counter_bits_le_all_blocks,
num_aes_inputs, (int_aes_encrypt_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks);
num_aes_inputs, (int_aes_encrypt_buffer<uint64_t, uint64_t> *)mem_ptr,
bsks, (uint64_t **)ksks);
}
void cleanup_cuda_integer_aes_encrypt_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_aes_encrypt_buffer<uint64_t> *mem_ptr =
(int_aes_encrypt_buffer<uint64_t> *)(*mem_ptr_void);
int_aes_encrypt_buffer<uint64_t, uint64_t> *mem_ptr =
(int_aes_encrypt_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -61,9 +62,10 @@ uint64_t scratch_cuda_integer_key_expansion_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_integer_key_expansion<uint64_t>(
CudaStreams(streams), (int_key_expansion_buffer<uint64_t> **)mem_ptr,
params, allocate_gpu_memory);
return scratch_cuda_integer_key_expansion<uint64_t, uint64_t>(
CudaStreams(streams),
(int_key_expansion_buffer<uint64_t, uint64_t> **)mem_ptr, params,
allocate_gpu_memory);
}
void cuda_integer_key_expansion_64(CudaStreamsFFI streams,
@@ -72,15 +74,16 @@ void cuda_integer_key_expansion_64(CudaStreamsFFI streams,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {
host_integer_key_expansion<uint64_t>(
host_integer_key_expansion<uint64_t, uint64_t>(
CudaStreams(streams), expanded_keys, key,
(int_key_expansion_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks);
(int_key_expansion_buffer<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks);
}
void cleanup_cuda_integer_key_expansion_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_key_expansion_buffer<uint64_t> *mem_ptr =
(int_key_expansion_buffer<uint64_t> *)(*mem_ptr_void);
int_key_expansion_buffer<uint64_t, uint64_t> *mem_ptr =
(int_key_expansion_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
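
One detail worth noting in these wrappers: the buffer crosses the FFI boundary as an opaque int8_t *, so the scratch, run, and cleanup entry points must all cast with the same pair of template arguments or the delete runs against the wrong type. A toy version of that contract (hypothetical names, not the real signatures):

#include <cstdint>

template <typename Torus, typename KSTorus> struct toy_aes_buffer {
  void release() {}
};

// scratch_*: allocates the typed buffer and hands it out as an opaque pointer.
int8_t *toy_scratch() {
  return reinterpret_cast<int8_t *>(new toy_aes_buffer<uint64_t, uint64_t>);
}

// cleanup_*: must cast back with the *same* template arguments before delete.
void toy_cleanup(int8_t **mem_ptr_void) {
  auto *mem =
      reinterpret_cast<toy_aes_buffer<uint64_t, uint64_t> *>(*mem_ptr_void);
  mem->release();
  delete mem;
  *mem_ptr_void = nullptr;
}

int main() {
  int8_t *mem = toy_scratch();
  toy_cleanup(&mem);
  return 0;
}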

View File

@@ -7,14 +7,14 @@
#include "../integer/scalar_addition.cuh"
#include "../linearalgebra/addition.cuh"
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_integer_aes_encrypt(
CudaStreams streams, int_aes_encrypt_buffer<Torus> **mem_ptr,
CudaStreams streams, int_aes_encrypt_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, bool allocate_gpu_memory, uint32_t num_aes_inputs,
uint32_t sbox_parallelism) {
uint64_t size_tracker = 0;
*mem_ptr = new int_aes_encrypt_buffer<Torus>(
*mem_ptr = new int_aes_encrypt_buffer<Torus, KSTorus>(
streams, params, allocate_gpu_memory, num_aes_inputs, sbox_parallelism,
size_tracker);
return size_tracker;
@@ -86,9 +86,9 @@ transpose_bitsliced_to_blocks(cudaStream_t stream, uint32_t gpu_index,
* Performs a vectorized homomorphic XOR operation on two sets of ciphertexts.
*
*/
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ __forceinline__ void
aes_xor(CudaStreams streams, int_aes_encrypt_buffer<Torus> *mem,
aes_xor(CudaStreams streams, int_aes_encrypt_buffer<Torus, KSTorus> *mem,
CudaRadixCiphertextFFI *out, const CudaRadixCiphertextFFI *lhs,
const CudaRadixCiphertextFFI *rhs) {
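
As a cleartext model of what aes_xor computes: adding two encrypted bits yields a value in {0, 1, 2}; the low bit of the sum is the XOR and the spilled bit is the AND, which is why a later flush pass can lazily reduce accumulated sums back to single bits. A sketch of that arithmetic, with plain integers standing in for ciphertexts:

#include <cassert>
#include <cstdint>

int main() {
  // Encrypted bits are modelled as small integers; homomorphic addition of
  // two bits yields a value in {0, 1, 2}.
  for (uint64_t a = 0; a <= 1; ++a)
    for (uint64_t b = 0; b <= 1; ++b) {
      uint64_t sum = a + b;        // LWE addition (the degree grows)
      uint64_t x = sum & 1;        // flush-style reduction: the XOR bit
      uint64_t c = (sum >> 1) & 1; // the spilled bit: the AND
      assert(x == (a ^ b) && c == (a & b));
    }
  return 0;
}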
@@ -108,8 +108,8 @@ aes_xor(CudaStreams streams, int_aes_encrypt_buffer<Torus> *mem,
template <typename Torus, typename KSTorus>
__host__ __forceinline__ void
aes_flush_inplace(CudaStreams streams, CudaRadixCiphertextFFI *data,
int_aes_encrypt_buffer<Torus> *mem, void *const *bsks,
KSTorus *const *ksks) {
int_aes_encrypt_buffer<Torus, KSTorus> *mem,
void *const *bsks, KSTorus *const *ksks) {
integer_radix_apply_univariate_lookup_table<Torus>(streams, data, data, bsks,
ksks, mem->luts->flush_lut,
@@ -125,7 +125,7 @@ template <typename Torus, typename KSTorus>
__host__ __forceinline__ void
aes_scalar_add_one_flush_inplace(CudaStreams streams,
CudaRadixCiphertextFFI *data,
int_aes_encrypt_buffer<Torus> *mem,
int_aes_encrypt_buffer<Torus, KSTorus> *mem,
void *const *bsks, KSTorus *const *ksks) {
host_add_scalar_one_inplace<Torus>(streams, data, mem->params.message_modulus,
@@ -147,7 +147,8 @@ aes_scalar_add_one_flush_inplace(CudaStreams streams,
template <typename Torus, typename KSTorus>
__host__ void
batch_vec_flush_inplace(CudaStreams streams, CudaRadixCiphertextFFI **targets,
size_t count, int_aes_encrypt_buffer<Torus> *mem,
size_t count,
int_aes_encrypt_buffer<Torus, KSTorus> *mem,
void *const *bsks, KSTorus *const *ksks) {
uint32_t num_radix_blocks = targets[0]->num_radix_blocks;
@@ -192,7 +193,7 @@ __host__ void batch_vec_and_inplace(CudaStreams streams,
CudaRadixCiphertextFFI **outs,
CudaRadixCiphertextFFI **lhs,
CudaRadixCiphertextFFI **rhs, size_t count,
int_aes_encrypt_buffer<Torus> *mem,
int_aes_encrypt_buffer<Torus, KSTorus> *mem,
void *const *bsks, KSTorus *const *ksks) {
uint32_t num_aes_inputs = outs[0]->num_radix_blocks;
@@ -277,12 +278,12 @@ __host__ void batch_vec_and_inplace(CudaStreams streams,
* ...
*/
template <typename Torus, typename KSTorus>
__host__ void vectorized_sbox_n_bytes(CudaStreams streams,
CudaRadixCiphertextFFI **sbox_io_bytes,
uint32_t num_bytes_parallel,
uint32_t num_aes_inputs,
int_aes_encrypt_buffer<Torus> *mem,
void *const *bsks, KSTorus *const *ksks) {
__host__ void
vectorized_sbox_n_bytes(CudaStreams streams,
CudaRadixCiphertextFFI **sbox_io_bytes,
uint32_t num_bytes_parallel, uint32_t num_aes_inputs,
int_aes_encrypt_buffer<Torus, KSTorus> *mem,
void *const *bsks, KSTorus *const *ksks) {
uint32_t num_sbox_blocks = num_bytes_parallel * num_aes_inputs;
@@ -618,11 +619,10 @@ __host__ void vectorized_sbox_n_bytes(CudaStreams streams,
*
*
*/
template <typename Torus>
__host__ void vectorized_shift_rows(CudaStreams streams,
CudaRadixCiphertextFFI *state_bitsliced,
uint32_t num_aes_inputs,
int_aes_encrypt_buffer<Torus> *mem) {
template <typename Torus, typename KSTorus>
__host__ void vectorized_shift_rows(
CudaStreams streams, CudaRadixCiphertextFFI *state_bitsliced,
uint32_t num_aes_inputs, int_aes_encrypt_buffer<Torus, KSTorus> *mem) {
constexpr uint32_t NUM_BYTES = 16;
constexpr uint32_t LEN_BYTE = 8;
constexpr uint32_t NUM_BITS = NUM_BYTES * LEN_BYTE;
@@ -669,11 +669,11 @@ __host__ void vectorized_shift_rows(CudaStreams streams,
* Helper for MixColumns. Homomorphically multiplies an 8-bit byte by 2.
*
*/
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void vectorized_mul_by_2(CudaStreams streams,
CudaRadixCiphertextFFI *res_byte,
CudaRadixCiphertextFFI *in_byte,
int_aes_encrypt_buffer<Torus> *mem) {
int_aes_encrypt_buffer<Torus, KSTorus> *mem) {
constexpr uint32_t LEN_BYTE = 8;
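
For reference, multiplying by 2 in AES's field GF(2^8) is the standard xtime operation: shift the byte left one bit and, if the bit that fell off was set, XOR in the reduction constant 0x1B. In the bitsliced layout used here that becomes a permutation of bit planes plus a few conditional XORs; the cleartext reference is:

#include <cassert>
#include <cstdint>

// xtime: multiply by x (i.e. by 2) in GF(2^8) modulo x^8 + x^4 + x^3 + x + 1.
uint8_t xtime(uint8_t b) {
  uint8_t shifted = static_cast<uint8_t>(b << 1);
  return (b & 0x80) ? static_cast<uint8_t>(shifted ^ 0x1B) : shifted;
}

int main() {
  assert(xtime(0x57) == 0xAE); // FIPS-197 worked example: {57} * {02} = {AE}
  assert(xtime(0xAE) == 0x47); // and {AE} * {02} = {47}
  return 0;
}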
@@ -705,11 +705,11 @@ __host__ void vectorized_mul_by_2(CudaStreams streams,
*
*/
template <typename Torus, typename KSTorus>
__host__ void vectorized_mix_columns(CudaStreams streams,
CudaRadixCiphertextFFI *s_bits,
uint32_t num_aes_inputs,
int_aes_encrypt_buffer<Torus> *mem,
void *const *bsks, KSTorus *const *ksks) {
__host__ void
vectorized_mix_columns(CudaStreams streams, CudaRadixCiphertextFFI *s_bits,
uint32_t num_aes_inputs,
int_aes_encrypt_buffer<Torus, KSTorus> *mem,
void *const *bsks, KSTorus *const *ksks) {
constexpr uint32_t BITS_PER_BYTE = 8;
constexpr uint32_t BYTES_PER_COLUMN = 4;
@@ -848,7 +848,7 @@ template <typename Torus, typename KSTorus>
__host__ void vectorized_aes_encrypt_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *all_states_bitsliced,
CudaRadixCiphertextFFI const *round_keys, uint32_t num_aes_inputs,
int_aes_encrypt_buffer<Torus> *mem, void *const *bsks,
int_aes_encrypt_buffer<Torus, KSTorus> *mem, void *const *bsks,
KSTorus *const *ksks) {
constexpr uint32_t BITS_PER_BYTE = 8;
@@ -994,7 +994,7 @@ template <typename Torus, typename KSTorus>
__host__ void vectorized_aes_full_adder_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *transposed_states,
const Torus *counter_bits_le_all_blocks, uint32_t num_aes_inputs,
int_aes_encrypt_buffer<Torus> *mem, void *const *bsks,
int_aes_encrypt_buffer<Torus, KSTorus> *mem, void *const *bsks,
KSTorus *const *ksks) {
constexpr uint32_t NUM_BITS = 128;
@@ -1100,7 +1100,7 @@ __host__ void host_integer_aes_ctr_encrypt(
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *iv, CudaRadixCiphertextFFI const *round_keys,
const Torus *counter_bits_le_all_blocks, uint32_t num_aes_inputs,
int_aes_encrypt_buffer<Torus> *mem, void *const *bsks,
int_aes_encrypt_buffer<Torus, KSTorus> *mem, void *const *bsks,
KSTorus *const *ksks) {
constexpr uint32_t NUM_BITS = 128;
@@ -1134,13 +1134,13 @@ __host__ void host_integer_aes_ctr_encrypt(
num_aes_inputs, NUM_BITS);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_integer_key_expansion(
CudaStreams streams, int_key_expansion_buffer<Torus> **mem_ptr,
CudaStreams streams, int_key_expansion_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_key_expansion_buffer<Torus>(
*mem_ptr = new int_key_expansion_buffer<Torus, KSTorus>(
streams, params, allocate_gpu_memory, size_tracker);
return size_tracker;
}
@@ -1154,12 +1154,12 @@ uint64_t scratch_cuda_integer_key_expansion(
* - If (i % 4 != 0): w_i = w_{i-4} + w_{i-1}
*/
template <typename Torus, typename KSTorus>
__host__ void host_integer_key_expansion(CudaStreams streams,
CudaRadixCiphertextFFI *expanded_keys,
CudaRadixCiphertextFFI const *key,
int_key_expansion_buffer<Torus> *mem,
void *const *bsks,
KSTorus *const *ksks) {
__host__ void
host_integer_key_expansion(CudaStreams streams,
CudaRadixCiphertextFFI *expanded_keys,
CudaRadixCiphertextFFI const *key,
int_key_expansion_buffer<Torus, KSTorus> *mem,
void *const *bsks, KSTorus *const *ksks) {
constexpr uint32_t BITS_PER_WORD = 32;
constexpr uint32_t BITS_PER_BYTE = 8;
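
The recurrence in the comment above is the FIPS-197 AES-128 key schedule with XOR written as +. A compact cleartext rendering (stub S-box, for shape only; the homomorphic version walks the same word indices bit by bit):

#include <cassert>
#include <cstdint>

// Stub S-box for illustration only; the real SubWord applies the AES S-box.
static uint32_t sub_word_stub(uint32_t w) { return w; }

static uint32_t rot_word(uint32_t w) { return (w << 8) | (w >> 24); }

// One step of the AES-128 key schedule over 32-bit words (FIPS-197):
//   i % 4 == 0 : w[i] = w[i-4] ^ SubWord(RotWord(w[i-1])) ^ Rcon[i/4]
//   otherwise  : w[i] = w[i-4] ^ w[i-1]          ("+" above means XOR)
static uint32_t next_word(const uint32_t *w, uint32_t i, uint32_t rcon) {
  uint32_t temp = w[i - 1];
  if (i % 4 == 0)
    temp = sub_word_stub(rot_word(temp)) ^ rcon;
  return w[i - 4] ^ temp;
}

int main() {
  uint32_t w[8] = {1, 2, 3, 4};
  for (uint32_t i = 4; i < 8; ++i)
    w[i] = next_word(w, i, 0x01000000u);
  assert(w[5] == (w[1] ^ w[4])); // the i % 4 != 0 rule from the comment
  return 0;
}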

View File

@@ -9,8 +9,8 @@ void cuda_integer_aes_ctr_256_encrypt_64(
host_integer_aes_ctr_256_encrypt<uint64_t>(
CudaStreams(streams), output, iv, round_keys, counter_bits_le_all_blocks,
num_aes_inputs, (int_aes_encrypt_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks);
num_aes_inputs, (int_aes_encrypt_buffer<uint64_t, uint64_t> *)mem_ptr,
bsks, (uint64_t **)ksks);
}
uint64_t scratch_cuda_integer_key_expansion_256_64(
@@ -28,8 +28,9 @@ uint64_t scratch_cuda_integer_key_expansion_256_64(
noise_reduction_type);
return scratch_cuda_integer_key_expansion_256<uint64_t>(
CudaStreams(streams), (int_key_expansion_256_buffer<uint64_t> **)mem_ptr,
params, allocate_gpu_memory);
CudaStreams(streams),
(int_key_expansion_256_buffer<uint64_t, uint64_t> **)mem_ptr, params,
allocate_gpu_memory);
}
void cuda_integer_key_expansion_256_64(CudaStreamsFFI streams,
@@ -38,16 +39,16 @@ void cuda_integer_key_expansion_256_64(CudaStreamsFFI streams,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {
host_integer_key_expansion_256<uint64_t>(
host_integer_key_expansion_256<uint64_t, uint64_t>(
CudaStreams(streams), expanded_keys, key,
(int_key_expansion_256_buffer<uint64_t> *)mem_ptr, bsks,
(int_key_expansion_256_buffer<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks);
}
void cleanup_cuda_integer_key_expansion_256_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_key_expansion_256_buffer<uint64_t> *mem_ptr =
(int_key_expansion_256_buffer<uint64_t> *)(*mem_ptr_void);
int_key_expansion_256_buffer<uint64_t, uint64_t> *mem_ptr =
(int_key_expansion_256_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;

View File

@@ -29,11 +29,12 @@
* - AddRoundKey
*
*/
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void vectorized_aes_256_encrypt_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *all_states_bitsliced,
CudaRadixCiphertextFFI const *round_keys, uint32_t num_aes_inputs,
int_aes_encrypt_buffer<Torus> *mem, void *const *bsks, Torus *const *ksks) {
int_aes_encrypt_buffer<Torus, KSTorus> *mem, void *const *bsks,
Torus *const *ksks) {
constexpr uint32_t BITS_PER_BYTE = 8;
constexpr uint32_t STATE_BYTES = 16;
@@ -179,12 +180,13 @@ __host__ void vectorized_aes_256_encrypt_inplace(
* +---------------------------------+
*
*/
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_aes_ctr_256_encrypt(
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *iv, CudaRadixCiphertextFFI const *round_keys,
const Torus *counter_bits_le_all_blocks, uint32_t num_aes_inputs,
int_aes_encrypt_buffer<Torus> *mem, void *const *bsks, Torus *const *ksks) {
int_aes_encrypt_buffer<Torus, KSTorus> *mem, void *const *bsks,
Torus *const *ksks) {
constexpr uint32_t NUM_BITS = 128;
@@ -217,13 +219,13 @@ __host__ void host_integer_aes_ctr_256_encrypt(
num_aes_inputs, NUM_BITS);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_integer_key_expansion_256(
CudaStreams streams, int_key_expansion_256_buffer<Torus> **mem_ptr,
CudaStreams streams, int_key_expansion_256_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_key_expansion_256_buffer<Torus>(
*mem_ptr = new int_key_expansion_256_buffer<Torus, KSTorus>(
streams, params, allocate_gpu_memory, size_tracker);
return size_tracker;
}
@@ -238,11 +240,12 @@ uint64_t scratch_cuda_integer_key_expansion_256(
* - If (i % 8 == 4): w_i = w_{i-8} + SubWord(w_{i-1})
* - Otherwise: w_i = w_{i-8} + w_{i-1}
*/
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_key_expansion_256(
CudaStreams streams, CudaRadixCiphertextFFI *expanded_keys,
CudaRadixCiphertextFFI const *key, int_key_expansion_256_buffer<Torus> *mem,
void *const *bsks, Torus *const *ksks) {
CudaRadixCiphertextFFI const *key,
int_key_expansion_256_buffer<Torus, KSTorus> *mem, void *const *bsks,
Torus *const *ksks) {
constexpr uint32_t BITS_PER_WORD = 32;
constexpr uint32_t BITS_PER_BYTE = 8;
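
AES-256 changes only the selection rule relative to the 128-bit schedule sketched earlier in this commit: the stride becomes 8 words, and the extra i % 8 == 4 case quoted in the comment above applies SubWord without RotWord or Rcon. In cleartext, with the same stub helpers:

#include <cassert>
#include <cstdint>

static uint32_t sub_word_stub(uint32_t w) { return w; } // placeholder S-box
static uint32_t rot_word(uint32_t w) { return (w << 8) | (w >> 24); }

// AES-256 (Nk = 8) word rule; "+" in the comment above is XOR:
//   i % 8 == 0 : w[i] = w[i-8] ^ SubWord(RotWord(w[i-1])) ^ Rcon[i/8]
//   i % 8 == 4 : w[i] = w[i-8] ^ SubWord(w[i-1])
//   otherwise  : w[i] = w[i-8] ^ w[i-1]
static uint32_t next_word_256(const uint32_t *w, uint32_t i, uint32_t rcon) {
  uint32_t temp = w[i - 1];
  if (i % 8 == 0)
    temp = sub_word_stub(rot_word(temp)) ^ rcon;
  else if (i % 8 == 4)
    temp = sub_word_stub(temp);
  return w[i - 8] ^ temp;
}

int main() {
  uint32_t w[16] = {1, 2, 3, 4, 5, 6, 7, 8};
  for (uint32_t i = 8; i < 16; ++i)
    w[i] = next_word_256(w, i, 0x01000000u);
  assert(w[9] == (w[1] ^ w[8])); // the plain i % 8 != {0, 4} case
  return 0;
}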

View File

@@ -15,15 +15,15 @@ uint64_t scratch_cuda_integer_abs_inplace_radix_ciphertext_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_abs<uint64_t>(
CudaStreams(streams), (int_abs_buffer<uint64_t> **)mem_ptr, is_signed,
num_blocks, params, allocate_gpu_memory);
CudaStreams(streams), (int_abs_buffer<uint64_t, uint64_t> **)mem_ptr,
is_signed, num_blocks, params, allocate_gpu_memory);
}
void cuda_integer_abs_inplace_radix_ciphertext_64(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *ct, int8_t *mem_ptr,
bool is_signed, void *const *bsks, void *const *ksks) {
auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;
auto mem = (int_abs_buffer<uint64_t, uint64_t> *)mem_ptr;
host_integer_abs<uint64_t>(CudaStreams(streams), ct, bsks,
(uint64_t **)(ksks), mem, is_signed);
@@ -31,8 +31,8 @@ void cuda_integer_abs_inplace_radix_ciphertext_64(
void cleanup_cuda_integer_abs_inplace(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_abs_buffer<uint64_t> *mem_ptr =
(int_abs_buffer<uint64_t> *)(*mem_ptr_void);
int_abs_buffer<uint64_t, uint64_t> *mem_ptr =
(int_abs_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;

View File

@@ -9,17 +9,16 @@
#include "integer/scalar_shifts.cuh"
#include "radix_ciphertext.cuh"
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_abs(CudaStreams streams,
int_abs_buffer<Torus> **mem_ptr,
bool is_signed, uint32_t num_blocks,
int_radix_params params,
bool allocate_gpu_memory) {
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_integer_abs(
CudaStreams streams, int_abs_buffer<Torus, KSTorus> **mem_ptr,
bool is_signed, uint32_t num_blocks, int_radix_params params,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
if (is_signed) {
*mem_ptr = new int_abs_buffer<Torus>(streams, params, num_blocks,
allocate_gpu_memory, size_tracker);
*mem_ptr = new int_abs_buffer<Torus, KSTorus>(
streams, params, num_blocks, allocate_gpu_memory, size_tracker);
}
return size_tracker;
}
@@ -27,7 +26,7 @@ __host__ uint64_t scratch_cuda_integer_abs(CudaStreams streams,
template <typename Torus>
__host__ void host_integer_abs(CudaStreams streams, CudaRadixCiphertextFFI *ct,
void *const *bsks, uint64_t *const *ksks,
int_abs_buffer<uint64_t> *mem_ptr,
int_abs_buffer<uint64_t, uint64_t> *mem_ptr,
bool is_signed) {
if (!is_signed)
return;
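
For context on the signed path that follows: the classic branchless absolute-value identity maps naturally onto radix ciphertexts (an arithmetic shift builds the sign mask, then one XOR-style pass and one subtraction). A cleartext statement of the identity, as a model of the math rather than a line-by-line mirror of the GPU code:

#include <cassert>
#include <cstdint>

// Branchless two's-complement abs: mask is 0 for x >= 0 and all-ones for
// x < 0, so (x ^ mask) - mask equals x or (~x + 1) = -x respectively.
// (Arithmetic right shift of negative values is compiler-defined before
// C++20 but universal in practice.)
int64_t branchless_abs(int64_t x) {
  int64_t mask = x >> 63; // arithmetic shift replicates the sign bit
  return (x ^ mask) - mask;
}

int main() {
  assert(branchless_abs(-5) == 5);
  assert(branchless_abs(7) == 7);
  assert(branchless_abs(0) == 0);
  return 0;
}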

View File

@@ -15,8 +15,9 @@ uint64_t scratch_cuda_boolean_bitop_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_boolean_bitop<uint64_t>(
CudaStreams(streams), (boolean_bitop_buffer<uint64_t> **)mem_ptr,
return scratch_cuda_boolean_bitop<uint64_t, uint64_t>(
CudaStreams(streams),
(boolean_bitop_buffer<uint64_t, uint64_t> **)mem_ptr,
lwe_ciphertext_count, params, op_type, is_unchecked, allocate_gpu_memory);
}
@@ -29,13 +30,14 @@ void cuda_boolean_bitop_ciphertext_64(CudaStreamsFFI streams,
host_boolean_bitop<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_1, lwe_array_2,
(boolean_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
(boolean_bitop_buffer<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
}
void cleanup_cuda_boolean_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
boolean_bitop_buffer<uint64_t> *mem_ptr =
(boolean_bitop_buffer<uint64_t> *)(*mem_ptr_void);
boolean_bitop_buffer<uint64_t, uint64_t> *mem_ptr =
(boolean_bitop_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
@@ -55,8 +57,9 @@ uint64_t scratch_cuda_boolean_bitnot_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_boolean_bitnot<uint64_t>(
CudaStreams(streams), (boolean_bitnot_buffer<uint64_t> **)mem_ptr, params,
return scratch_cuda_boolean_bitnot<uint64_t, uint64_t>(
CudaStreams(streams),
(boolean_bitnot_buffer<uint64_t, uint64_t> **)mem_ptr, params,
lwe_ciphertext_count, is_unchecked, allocate_gpu_memory);
}
@@ -64,16 +67,17 @@ void cuda_boolean_bitnot_ciphertext_64(CudaStreamsFFI streams,
CudaRadixCiphertextFFI *lwe_array,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {
host_boolean_bitnot<uint64_t>(CudaStreams(streams), lwe_array,
(boolean_bitnot_buffer<uint64_t> *)mem_ptr,
bsks, (uint64_t **)(ksks));
host_boolean_bitnot<uint64_t>(
CudaStreams(streams), lwe_array,
(boolean_bitnot_buffer<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
}
void cleanup_cuda_boolean_bitnot(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
boolean_bitnot_buffer<uint64_t> *mem_ptr =
(boolean_bitnot_buffer<uint64_t> *)(*mem_ptr_void);
boolean_bitnot_buffer<uint64_t, uint64_t> *mem_ptr =
(boolean_bitnot_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
@@ -93,8 +97,8 @@ uint64_t scratch_cuda_bitop_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_bitop<uint64_t>(
CudaStreams(streams), (int_bitop_buffer<uint64_t> **)mem_ptr,
return scratch_cuda_bitop<uint64_t, uint64_t>(
CudaStreams(streams), (int_bitop_buffer<uint64_t, uint64_t> **)mem_ptr,
lwe_ciphertext_count, params, op_type, allocate_gpu_memory);
}
@@ -117,14 +121,15 @@ void cuda_bitop_ciphertext_64(CudaStreamsFFI streams,
void *const *ksks) {
host_bitop<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_array_1,
lwe_array_2, (int_bitop_buffer<uint64_t> *)mem_ptr, bsks,
lwe_array_2,
(int_bitop_buffer<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
}
void cleanup_cuda_integer_bitop(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_bitop_buffer<uint64_t> *mem_ptr =
(int_bitop_buffer<uint64_t> *)(*mem_ptr_void);
int_bitop_buffer<uint64_t, uint64_t> *mem_ptr =
(int_bitop_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
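
All of these bindings share the scratch/operate/cleanup lifecycle, and the scratch functions double as dry runs: with allocate_gpu_memory false they walk the same constructors and only accumulate size_tracker, so a caller can query the memory footprint before committing. A toy model of that convention (malloc standing in for device allocation):

#include <cstdint>
#include <cstdlib>

// Toy version of the size_tracker convention used by the scratch_* functions.
struct toy_buffer {
  void *data = nullptr;
  toy_buffer(uint64_t bytes, bool allocate_gpu_memory, uint64_t &size_tracker) {
    size_tracker += bytes;       // always tracked, allocated or not
    if (allocate_gpu_memory)
      data = std::malloc(bytes); // stand-in for a device allocation
  }
  void release() {
    std::free(data);
    data = nullptr;
  }
};

uint64_t toy_scratch(toy_buffer **mem_ptr, bool allocate_gpu_memory) {
  uint64_t size_tracker = 0;
  *mem_ptr = new toy_buffer(1024, allocate_gpu_memory, size_tracker);
  return size_tracker; // reports memory needs even on a dry run
}

int main() {
  toy_buffer *dry = nullptr;
  uint64_t needed = toy_scratch(&dry, false); // dry run: reports 1024 bytes
  dry->release();
  delete dry;
  toy_buffer *real = nullptr;
  toy_scratch(&real, true); // real run: actually allocates
  real->release();
  delete real;
  return (needed == 1024) ? 0 : 1;
}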

View File

@@ -9,16 +9,16 @@
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_boolean_bitop(
CudaStreams streams, boolean_bitop_buffer<Torus> **mem_ptr,
CudaStreams streams, boolean_bitop_buffer<Torus, KSTorus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, BITOP_TYPE op,
bool is_unchecked, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new boolean_bitop_buffer<Torus>(streams, op, is_unchecked, params,
num_radix_blocks,
allocate_gpu_memory, size_tracker);
*mem_ptr = new boolean_bitop_buffer<Torus, KSTorus>(
streams, op, is_unchecked, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
@@ -27,7 +27,7 @@ __host__ void host_boolean_bitop(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2,
boolean_bitop_buffer<Torus> *mem_ptr,
boolean_bitop_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, KSTorus *const *ksks) {
PANIC_IF_FALSE(
@@ -190,24 +190,24 @@ host_bitnot(CudaStreams streams, CudaRadixCiphertextFFI *radix_ciphertext,
}
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_boolean_bitnot(
CudaStreams streams, boolean_bitnot_buffer<Torus> **mem_ptr,
CudaStreams streams, boolean_bitnot_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t lwe_ciphertext_count, bool is_unchecked,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new boolean_bitnot_buffer<Torus>(
*mem_ptr = new boolean_bitnot_buffer<Torus, KSTorus>(
streams, params, lwe_ciphertext_count, is_unchecked, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus, typename KSTorus>
__host__ void host_boolean_bitnot(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array,
boolean_bitnot_buffer<Torus> *mem_ptr,
void *const *bsks, KSTorus *const *ksks) {
__host__ void
host_boolean_bitnot(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
boolean_bitnot_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, KSTorus *const *ksks) {
bool carries_empty = true;
for (size_t i = 0; i < lwe_array->num_radix_blocks; ++i) {
if (lwe_array->degrees[i] >= mem_ptr->params.message_modulus) {
@@ -233,8 +233,8 @@ __host__ void host_bitop(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2,
int_bitop_buffer<Torus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks) {
int_bitop_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, KSTorus *const *ksks) {
PANIC_IF_FALSE(
lwe_array_out->num_radix_blocks == lwe_array_1->num_radix_blocks &&
@@ -269,16 +269,16 @@ __host__ void host_bitop(CudaStreams streams,
lwe_array_out->num_radix_blocks * sizeof(uint64_t));
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_bitop(CudaStreams streams,
int_bitop_buffer<Torus> **mem_ptr,
int_bitop_buffer<Torus, KSTorus> **mem_ptr,
uint32_t num_radix_blocks,
int_radix_params params, BITOP_TYPE op,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_bitop_buffer<Torus>(streams, op, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
*mem_ptr = new int_bitop_buffer<Torus, KSTorus>(
streams, op, params, num_radix_blocks, allocate_gpu_memory, size_tracker);
return size_tracker;
}

View File

@@ -42,9 +42,10 @@ uint64_t scratch_cuda_cast_to_unsigned_64(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_cast_to_unsigned<uint64_t>(
CudaStreams(streams), (int_cast_to_unsigned_buffer<uint64_t> **)mem_ptr,
params, num_input_blocks, target_num_blocks, input_is_signed,
return scratch_cuda_cast_to_unsigned<uint64_t, uint64_t>(
CudaStreams(streams),
(int_cast_to_unsigned_buffer<uint64_t, uint64_t> **)mem_ptr, params,
num_input_blocks, target_num_blocks, input_is_signed,
requires_full_propagate, allocate_gpu_memory);
}
@@ -54,16 +55,16 @@ void cuda_cast_to_unsigned_64(CudaStreamsFFI streams,
uint32_t target_num_blocks, bool input_is_signed,
void *const *bsks, void *const *ksks) {
host_cast_to_unsigned<uint64_t>(
host_cast_to_unsigned<uint64_t, uint64_t>(
CudaStreams(streams), output, input,
(int_cast_to_unsigned_buffer<uint64_t> *)mem_ptr, target_num_blocks,
input_is_signed, bsks, (uint64_t **)ksks);
(int_cast_to_unsigned_buffer<uint64_t, uint64_t> *)mem_ptr,
target_num_blocks, input_is_signed, bsks, (uint64_t **)ksks);
}
void cleanup_cuda_cast_to_unsigned_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_cast_to_unsigned_buffer<uint64_t> *mem_ptr =
(int_cast_to_unsigned_buffer<uint64_t> *)(*mem_ptr_void);
int_cast_to_unsigned_buffer<uint64_t, uint64_t> *mem_ptr =
(int_cast_to_unsigned_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
@@ -85,9 +86,10 @@ uint64_t scratch_cuda_cast_to_signed_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_cast_to_signed<uint64_t>(
CudaStreams(streams), (int_cast_to_signed_buffer<uint64_t> **)mem_ptr,
params, num_input_blocks, target_num_blocks, input_is_signed,
return scratch_cuda_cast_to_signed<uint64_t, uint64_t>(
CudaStreams(streams),
(int_cast_to_signed_buffer<uint64_t, uint64_t> **)mem_ptr, params,
num_input_blocks, target_num_blocks, input_is_signed,
allocate_gpu_memory);
}
@@ -97,15 +99,16 @@ void cuda_cast_to_signed_64(CudaStreamsFFI streams,
bool input_is_signed, void *const *bsks,
void *const *ksks) {
host_cast_to_signed<uint64_t>(CudaStreams(streams), output, input,
(int_cast_to_signed_buffer<uint64_t> *)mem,
input_is_signed, bsks, (uint64_t **)ksks);
host_cast_to_signed<uint64_t>(
CudaStreams(streams), output, input,
(int_cast_to_signed_buffer<uint64_t, uint64_t> *)mem, input_is_signed,
bsks, (uint64_t **)ksks);
}
void cleanup_cuda_cast_to_signed_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_cast_to_signed_buffer<uint64_t> *mem_ptr =
(int_cast_to_signed_buffer<uint64_t> *)(*mem_ptr_void);
int_cast_to_signed_buffer<uint64_t, uint64_t> *mem_ptr =
(int_cast_to_signed_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));

View File

@@ -53,15 +53,16 @@ host_trim_radix_blocks_msb(CudaRadixCiphertextFFI *output_radix,
output_radix->num_radix_blocks);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_extend_radix_with_sign_msb(
CudaStreams streams, int_extend_radix_with_sign_msb_buffer<Torus> **mem_ptr,
CudaStreams streams,
int_extend_radix_with_sign_msb_buffer<Torus, KSTorus> **mem_ptr,
const int_radix_params params, uint32_t num_radix_blocks,
uint32_t num_additional_blocks, const bool allocate_gpu_memory) {
PUSH_RANGE("scratch cast/extend")
uint64_t size_tracker = 0;
*mem_ptr = new int_extend_radix_with_sign_msb_buffer<Torus>(
*mem_ptr = new int_extend_radix_with_sign_msb_buffer<Torus, KSTorus>(
streams, params, num_radix_blocks, num_additional_blocks,
allocate_gpu_memory, size_tracker);
POP_RANGE()
@@ -72,7 +73,7 @@ template <typename Torus, typename KSTorus>
__host__ void host_extend_radix_with_sign_msb(
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
int_extend_radix_with_sign_msb_buffer<Torus> *mem_ptr,
int_extend_radix_with_sign_msb_buffer<Torus, KSTorus> *mem_ptr,
uint32_t num_additional_blocks, void *const *bsks, KSTorus *const *ksks) {
if (num_additional_blocks == 0) {
@@ -108,26 +109,26 @@ __host__ void host_extend_radix_with_sign_msb(
POP_RANGE()
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_cast_to_unsigned(
CudaStreams streams, int_cast_to_unsigned_buffer<Torus> **mem_ptr,
CudaStreams streams, int_cast_to_unsigned_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_input_blocks,
uint32_t target_num_blocks, bool input_is_signed,
bool requires_full_propagate, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_cast_to_unsigned_buffer<Torus>(
*mem_ptr = new int_cast_to_unsigned_buffer<Torus, KSTorus>(
streams, params, num_input_blocks, target_num_blocks, input_is_signed,
requires_full_propagate, allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void
host_cast_to_unsigned(CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI *input,
int_cast_to_unsigned_buffer<Torus> *mem_ptr,
int_cast_to_unsigned_buffer<Torus, KSTorus> *mem_ptr,
uint32_t target_num_blocks, bool input_is_signed,
void *const *bsks, Torus *const *ksks) {
@@ -160,27 +161,27 @@ host_cast_to_unsigned(CudaStreams streams, CudaRadixCiphertextFFI *output,
}
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t
scratch_cuda_cast_to_signed(CudaStreams streams,
int_cast_to_signed_buffer<Torus> **mem_ptr,
int_cast_to_signed_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_input_blocks,
uint32_t target_num_blocks, bool input_is_signed,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_cast_to_signed_buffer<Torus>(
*mem_ptr = new int_cast_to_signed_buffer<Torus, KSTorus>(
streams, params, num_input_blocks, target_num_blocks, input_is_signed,
allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void
host_cast_to_signed(CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
int_cast_to_signed_buffer<Torus> *mem_ptr,
int_cast_to_signed_buffer<Torus, KSTorus> *mem_ptr,
bool input_is_signed, void *const *bsks, Torus **ksks) {
uint32_t current_num_blocks = input->num_radix_blocks;
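
As a cleartext reminder of the semantics these casts implement at block granularity: widening a signed value replicates the sign into the new most-significant blocks (which is what host_extend_radix_with_sign_msb does homomorphically), same-width signed-to-unsigned reinterprets the two's-complement pattern, and narrowing drops the most-significant blocks. For example:

#include <cassert>
#include <cstdint>

int main() {
  int8_t x = -3; // 0xFD in two's complement
  // Widening a signed value replicates the sign bit into the new width:
  int16_t wide = static_cast<int16_t>(x);
  assert(static_cast<uint16_t>(wide) == 0xFFFD);
  // Casting to unsigned at the same width reinterprets the bit pattern:
  assert(static_cast<uint8_t>(x) == 0xFD);
  // Narrowing simply drops the most-significant part:
  int16_t big = 0x1234;
  assert(static_cast<uint8_t>(big) == 0x34);
  return 0;
}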

View File

@@ -19,8 +19,8 @@ uint64_t scratch_cuda_cmux_64(CudaStreamsFFI streams, int8_t **mem_ptr,
std::function<uint64_t(uint64_t)> predicate_lut_f =
[](uint64_t x) -> uint64_t { return x == 1; };
uint64_t ret = scratch_cuda_cmux<uint64_t>(
CudaStreams(streams), (int_cmux_buffer<uint64_t> **)mem_ptr,
uint64_t ret = scratch_cuda_cmux<uint64_t, uint64_t>(
CudaStreams(streams), (int_cmux_buffer<uint64_t, uint64_t> **)mem_ptr,
predicate_lut_f, lwe_ciphertext_count, params, allocate_gpu_memory);
POP_RANGE()
return ret;
@@ -36,15 +36,15 @@ void cuda_cmux_ciphertext_64(CudaStreamsFFI streams,
PUSH_RANGE("cmux")
host_cmux<uint64_t>(CudaStreams(streams), lwe_array_out, lwe_condition,
lwe_array_true, lwe_array_false,
(int_cmux_buffer<uint64_t> *)mem_ptr, bsks,
(int_cmux_buffer<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
POP_RANGE()
}
void cleanup_cuda_cmux(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup cmux")
int_cmux_buffer<uint64_t> *mem_ptr =
(int_cmux_buffer<uint64_t> *)(*mem_ptr_void);
int_cmux_buffer<uint64_t, uint64_t> *mem_ptr =
(int_cmux_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
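
The _64 entry points follow a single lifecycle: scratch stores a typed buffer behind an opaque int8_t **, the operation call casts it back, and cleanup releases it and nulls the handle. A sketch of that ownership flow under a stand-in buffer type (the real types are int_cmux_buffer<uint64_t, uint64_t> and friends):

#include <cstdint>

struct opaque_buffer { // stand-in for int_cmux_buffer<uint64_t, uint64_t>
  void release() { /* free GPU resources */ }
};

void scratch_op(int8_t **mem_ptr) {
  *mem_ptr = (int8_t *)new opaque_buffer(); // typed object, opaque handle
}

void run_op(int8_t *mem_ptr) {
  auto *buf = (opaque_buffer *)mem_ptr; // cast back to the typed buffer
  (void)buf; // real code launches kernels using buf's workspaces
}

void cleanup_op(int8_t **mem_ptr_void) {
  auto *buf = (opaque_buffer *)(*mem_ptr_void);
  buf->release();
  delete buf;
  *mem_ptr_void = nullptr; // prevent double-free through the FFI handle
}

int main() {
  int8_t *mem = nullptr;
  scratch_op(&mem);
  run_op(mem);
  cleanup_op(&mem);
}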

View File

@@ -6,13 +6,13 @@
#include "radix_ciphertext.cuh"
template <typename Torus, typename KSTorus>
__host__ void zero_out_if(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input,
CudaRadixCiphertextFFI const *lwe_condition,
int_zero_out_if_buffer<Torus> *mem_ptr,
int_radix_lut<Torus> *predicate, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks) {
__host__ void
zero_out_if(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_input,
CudaRadixCiphertextFFI const *lwe_condition,
int_zero_out_if_buffer<Torus, KSTorus> *mem_ptr,
int_radix_lut<Torus, KSTorus> *predicate, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks) {
PANIC_IF_FALSE(
lwe_array_out->num_radix_blocks >= num_radix_blocks &&
lwe_array_input->num_radix_blocks >= num_radix_blocks,
@@ -47,8 +47,8 @@ __host__ void host_cmux(CudaStreams streams,
CudaRadixCiphertextFFI const *lwe_condition,
CudaRadixCiphertextFFI const *lwe_array_true,
CudaRadixCiphertextFFI const *lwe_array_false,
int_cmux_buffer<Torus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks) {
int_cmux_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, KSTorus *const *ksks) {
if (lwe_array_out->num_radix_blocks != lwe_array_true->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -94,17 +94,17 @@ __host__ void host_cmux(CudaStreams streams,
mem_ptr->message_extract_lut, num_radix_blocks);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_cmux(CudaStreams streams,
int_cmux_buffer<Torus> **mem_ptr,
int_cmux_buffer<Torus, KSTorus> **mem_ptr,
std::function<Torus(Torus)> predicate_lut_f,
uint32_t num_radix_blocks,
int_radix_params params,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_cmux_buffer<Torus>(streams, predicate_lut_f, params,
num_radix_blocks, allocate_gpu_memory,
size_tracker);
*mem_ptr = new int_cmux_buffer<Torus, KSTorus>(
streams, predicate_lut_f, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
#endif
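
In the clear, the selection host_cmux computes is out = cond ? t : f, evaluated branchlessly as cond*t + (1-cond)*f so that control flow never depends on the encrypted bit; zero_out_if kills the untaken branch and message_extract_lut cleans the sum. A plaintext model of one radix block (noise and moduli ignored):

#include <cassert>
#include <cstdint>

// Plaintext model of a cmux on one radix block: the encrypted version
// zeroes out the branch not taken (zero_out_if) and adds the results,
// then applies a message-extract LUT to clean carries.
uint64_t cmux_model(uint64_t cond, uint64_t t, uint64_t f) {
  assert(cond == 0 || cond == 1); // condition is a boolean block
  return cond * t + (1 - cond) * f;
}

int main() {
  assert(cmux_model(1, 7, 3) == 7);
  assert(cmux_model(0, 7, 3) == 3);
}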

View File

@@ -19,8 +19,9 @@ uint64_t scratch_cuda_comparison_64(
case EQ:
case NE:
size_tracker += scratch_cuda_comparison_check<uint64_t>(
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, op_type, false, allocate_gpu_memory);
CudaStreams(streams),
(int_comparison_buffer<uint64_t, uint64_t> **)mem_ptr, num_radix_blocks,
params, op_type, false, allocate_gpu_memory);
break;
case GT:
case GE:
@@ -29,8 +30,9 @@ uint64_t scratch_cuda_comparison_64(
case MAX:
case MIN:
size_tracker += scratch_cuda_comparison_check<uint64_t>(
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, op_type, is_signed, allocate_gpu_memory);
CudaStreams(streams),
(int_comparison_buffer<uint64_t, uint64_t> **)mem_ptr, num_radix_blocks,
params, op_type, is_signed, allocate_gpu_memory);
break;
}
POP_RANGE()
@@ -50,8 +52,8 @@ void cuda_comparison_ciphertext_64(CudaStreamsFFI streams,
// depending on the case (eq/gt vs max/min), so the number of blocks to
// consider for the computation is that of the input
auto num_radix_blocks = lwe_array_1->num_radix_blocks;
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
int_comparison_buffer<uint64_t, uint64_t> *buffer =
(int_comparison_buffer<uint64_t, uint64_t> *)mem_ptr;
switch (buffer->op) {
case EQ:
case NE:
@@ -88,8 +90,8 @@ void cuda_comparison_ciphertext_64(CudaStreamsFFI streams,
void cleanup_cuda_integer_comparison(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup comparison")
int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
int_comparison_buffer<uint64_t, uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
@@ -111,8 +113,9 @@ uint64_t scratch_cuda_integer_are_all_comparisons_block_true_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_comparison_check<uint64_t>(
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, EQ, false, allocate_gpu_memory);
CudaStreams(streams),
(int_comparison_buffer<uint64_t, uint64_t> **)mem_ptr, num_radix_blocks,
params, EQ, false, allocate_gpu_memory);
}
void cuda_integer_are_all_comparisons_block_true_64(
@@ -120,8 +123,8 @@ void cuda_integer_are_all_comparisons_block_true_64(
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
int_comparison_buffer<uint64_t, uint64_t> *buffer =
(int_comparison_buffer<uint64_t, uint64_t> *)mem_ptr;
host_integer_are_all_comparisons_block_true<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
@@ -131,8 +134,8 @@ void cuda_integer_are_all_comparisons_block_true_64(
void cleanup_cuda_integer_are_all_comparisons_block_true(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
int_comparison_buffer<uint64_t, uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
@@ -153,8 +156,9 @@ uint64_t scratch_cuda_integer_is_at_least_one_comparisons_block_true_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_comparison_check<uint64_t>(
CudaStreams(streams), (int_comparison_buffer<uint64_t> **)mem_ptr,
num_radix_blocks, params, EQ, false, allocate_gpu_memory);
CudaStreams(streams),
(int_comparison_buffer<uint64_t, uint64_t> **)mem_ptr, num_radix_blocks,
params, EQ, false, allocate_gpu_memory);
}
void cuda_integer_is_at_least_one_comparisons_block_true_64(
@@ -162,8 +166,8 @@ void cuda_integer_is_at_least_one_comparisons_block_true_64(
CudaRadixCiphertextFFI const *lwe_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *ksks, uint32_t num_radix_blocks) {
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
int_comparison_buffer<uint64_t, uint64_t> *buffer =
(int_comparison_buffer<uint64_t, uint64_t> *)mem_ptr;
host_integer_is_at_least_one_comparisons_block_true<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in, buffer, bsks,
@@ -173,8 +177,8 @@ void cuda_integer_is_at_least_one_comparisons_block_true_64(
void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_comparison_buffer<uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t> *)(*mem_ptr_void);
int_comparison_buffer<uint64_t, uint64_t> *mem_ptr =
(int_comparison_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
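
The EQ/NE path dispatched above reduces to two stages: per-block equality comparisons, then are_all_comparisons_block_true folds the block-wise bits into a single boolean block. A plaintext model of the two stages:

#include <cassert>
#include <cstdint>
#include <vector>

// Plaintext model of host_equality_check: compare radix blocks pairwise,
// then reduce with "are all comparison blocks true".
bool equality_model(const std::vector<uint64_t> &a,
                    const std::vector<uint64_t> &b) {
  assert(a.size() == b.size());
  std::vector<uint64_t> block_eq(a.size());
  for (size_t i = 0; i < a.size(); ++i)
    block_eq[i] = (a[i] == b[i]) ? 1 : 0; // per-block comparison
  uint64_t all_true = 1;
  for (uint64_t bit : block_eq) // are_all_comparisons_block_true
    all_true &= bit;
  return all_true == 1;
}

int main() {
  assert(equality_model({1, 2, 3}, {1, 2, 3}));
  assert(!equality_model({1, 2, 3}, {1, 2, 4}));
}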

View File

@@ -60,7 +60,7 @@ template <typename Torus, typename KSTorus>
__host__ void are_all_comparisons_block_true(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
int_comparison_buffer<Torus, KSTorus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
@@ -123,7 +123,7 @@ __host__ void are_all_comparisons_block_true(
auto accumulator = are_all_block_true_buffer->tmp_block_accumulated;
// Selects a LUT
int_radix_lut<Torus> *lut;
int_radix_lut<Torus, KSTorus> *lut;
if (are_all_block_true_buffer->op == COMPARISON_TYPE::NE) {
// is_non_zero_lut_buffer LUT
lut = mem_ptr->eq_buffer->is_non_zero_lut;
@@ -195,7 +195,7 @@ template <typename Torus, typename KSTorus>
__host__ void is_at_least_one_comparisons_block_true(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
int_comparison_buffer<Torus, KSTorus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
@@ -243,7 +243,7 @@ __host__ void is_at_least_one_comparisons_block_true(
}
// Selects a LUT
int_radix_lut<Torus> *lut = mem_ptr->eq_buffer->is_non_zero_lut;
int_radix_lut<Torus, KSTorus> *lut = mem_ptr->eq_buffer->is_non_zero_lut;
// Applies the LUT
if (remaining_blocks == 1) {
@@ -264,9 +264,9 @@ template <typename Torus, typename KSTorus>
__host__ void host_compare_blocks_with_zero(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
int_comparison_buffer<Torus, KSTorus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks, int32_t num_radix_blocks,
int_radix_lut<Torus> *zero_comparison) {
int_radix_lut<Torus, KSTorus> *zero_comparison) {
if (num_radix_blocks == 0)
return;
@@ -332,8 +332,9 @@ __host__ void
host_equality_check(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks) {
int_comparison_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, KSTorus *const *ksks,
uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
@@ -360,8 +361,9 @@ __host__ void
compare_radix_blocks(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks) {
int_comparison_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, KSTorus *const *ksks,
uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_right->lwe_dimension)
@@ -411,7 +413,7 @@ template <typename Torus, typename KSTorus>
__host__ void
tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI *lwe_block_comparisons,
int_tree_sign_reduction_buffer<Torus> *tree_buffer,
int_tree_sign_reduction_buffer<Torus, KSTorus> *tree_buffer,
std::function<Torus(Torus)> sign_handler_f,
void *const *bsks, KSTorus *const *ksks,
uint32_t num_radix_blocks) {
@@ -501,7 +503,7 @@ __host__ void host_difference_check(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr,
int_comparison_buffer<Torus, KSTorus> *mem_ptr,
std::function<Torus(Torus)> reduction_lut_f, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks) {
@@ -651,14 +653,14 @@ __host__ void host_difference_check(
bsks, ksks, num_comparisons);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_comparison_check(
CudaStreams streams, int_comparison_buffer<Torus> **mem_ptr,
CudaStreams streams, int_comparison_buffer<Torus, KSTorus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, COMPARISON_TYPE op,
bool is_signed, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_comparison_buffer<Torus>(
*mem_ptr = new int_comparison_buffer<Torus, KSTorus>(
streams, op, params, num_radix_blocks, is_signed, allocate_gpu_memory,
size_tracker);
return size_tracker;
@@ -669,7 +671,7 @@ __host__ void
host_maxmin(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_left,
CudaRadixCiphertextFFI const *lwe_array_right,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
int_comparison_buffer<Torus, KSTorus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_left->lwe_dimension ||
@@ -696,7 +698,7 @@ template <typename Torus, typename KSTorus>
__host__ void host_integer_are_all_comparisons_block_true(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
int_comparison_buffer<Torus, KSTorus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks) {
// It returns a block encrypting 1 if all input blocks are 1
@@ -709,7 +711,7 @@ template <typename Torus, typename KSTorus>
__host__ void host_integer_is_at_least_one_comparisons_block_true(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
int_comparison_buffer<Torus, KSTorus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks) {
// It returns a block encrypting 1 if at least one input block is 1
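
In the clear this is an OR-reduction: chunks of boolean blocks are summed and each sum goes through the is_non_zero LUT, shrinking the block count each round. A plaintext model of one step (the chunk size is bounded by a block's carry space; 4 below is an assumed value):

#include <cassert>
#include <cstdint>
#include <vector>

// One step: sum up to `chunk` boolean blocks, then apply the
// is-non-zero LUT (x -> x != 0), shrinking the block count.
std::vector<uint64_t> or_reduce_step(const std::vector<uint64_t> &bits,
                                     size_t chunk) {
  std::vector<uint64_t> out;
  for (size_t i = 0; i < bits.size(); i += chunk) {
    uint64_t acc = 0;
    for (size_t j = i; j < i + chunk && j < bits.size(); ++j)
      acc += bits[j]; // homomorphic additions in the real code
    out.push_back(acc != 0 ? 1 : 0); // is_non_zero_lut
  }
  return out;
}

int main() {
  std::vector<uint64_t> bits{0, 0, 1, 0, 0, 0, 0, 0};
  while (bits.size() > 1)
    bits = or_reduce_step(bits, 4);
  assert(bits[0] == 1);
}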

View File

@@ -39,7 +39,7 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
carry_modulus, noise_reduction_type);
return scratch_cuda_integer_decompress_radix_ciphertext<uint64_t>(
CudaStreams(streams), (int_decompression<uint64_t> **)mem_ptr,
CudaStreams(streams), (int_decompression<uint64_t, uint64_t> **)mem_ptr,
num_blocks_to_decompress, encryption_params, compression_params,
allocate_gpu_memory);
}
@@ -57,9 +57,9 @@ void cuda_integer_decompress_radix_ciphertext_64(
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr) {
host_integer_decompress<uint64_t>(CudaStreams(streams), lwe_array_out,
glwe_in, indexes_array, bsks,
(int_decompression<uint64_t> *)mem_ptr);
host_integer_decompress<uint64_t>(
CudaStreams(streams), lwe_array_out, glwe_in, indexes_array, bsks,
(int_decompression<uint64_t, uint64_t> *)mem_ptr);
}
void cleanup_cuda_integer_compress_radix_ciphertext_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
@@ -73,8 +73,8 @@ void cleanup_cuda_integer_compress_radix_ciphertext_64(CudaStreamsFFI streams,
void cleanup_cuda_integer_decompress_radix_ciphertext_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_decompression<uint64_t> *mem_ptr =
(int_decompression<uint64_t> *)(*mem_ptr_void);
int_decompression<uint64_t, uint64_t> *mem_ptr =
(int_decompression<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
@@ -112,9 +112,9 @@ uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
PBS_MS_REDUCTION_T::NO_REDUCTION);
return scratch_cuda_integer_decompress_radix_ciphertext<__uint128_t>(
CudaStreams(streams), (int_decompression<__uint128_t> **)mem_ptr,
num_radix_blocks, compression_params, compression_params,
allocate_gpu_memory);
CudaStreams(streams),
(int_decompression<__uint128_t, uint64_t> **)mem_ptr, num_radix_blocks,
compression_params, compression_params, allocate_gpu_memory);
}
void cuda_integer_compress_radix_ciphertext_128(
CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
@@ -132,7 +132,7 @@ void cuda_integer_decompress_radix_ciphertext_128(
host_integer_decompress<__uint128_t>(
CudaStreams(streams), lwe_array_out, glwe_in, indexes_array, nullptr,
(int_decompression<__uint128_t> *)mem_ptr);
(int_decompression<__uint128_t, uint64_t> *)mem_ptr);
}
void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
@@ -148,8 +148,8 @@ void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
void cleanup_cuda_integer_decompress_radix_ciphertext_128(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_decompression<__uint128_t> *mem_ptr =
(int_decompression<__uint128_t> *)(*mem_ptr_void);
int_decompression<__uint128_t, uint64_t> *mem_ptr =
(int_decompression<__uint128_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
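
The 128-bit path shows why the buffer types gained a second parameter: int_decompression<__uint128_t, uint64_t> pairs a 128-bit ciphertext torus with 64-bit keyswitch material, so the two widths are now independent. A sketch of such a split-typed buffer declaration (fields are stand-ins only):

#include <cstdint>

// Sketch of a buffer carrying both torus types: Torus for ciphertext data,
// KSTorus for keyswitch-related working memory.
template <typename Torus, typename KSTorus> struct split_torus_buffer {
  Torus *ciphertext_workspace = nullptr; // lives in the wide torus
  KSTorus *after_ks_workspace = nullptr; // lives in the keyswitch torus
};

int main() {
  // 64-bit ops keep both parameters identical ...
  split_torus_buffer<uint64_t, uint64_t> b64;
  // ... while 128-bit decompression pairs a wide data torus with 64-bit KS.
  split_torus_buffer<__uint128_t, uint64_t> b128;
  (void)b64;
  (void)b128;
}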

View File

@@ -244,13 +244,13 @@ __host__ void host_extract(cudaStream_t stream, uint32_t gpu_index,
check_cuda_error(cudaGetLastError());
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void
host_integer_decompress(CudaStreams streams,
CudaLweCiphertextListFFI *d_lwe_array_out,
CudaPackedGlweCiphertextListFFI const *d_packed_glwe_in,
uint32_t const *h_indexes_array, void *const *d_bsks,
int_decompression<Torus> *h_mem_ptr) {
int_decompression<Torus, KSTorus> *h_mem_ptr) {
static_assert(std::is_same_v<Torus, uint64_t> ||
std::is_same_v<Torus, __uint128_t>,
@@ -413,14 +413,14 @@ __host__ uint64_t scratch_cuda_compress_ciphertext(
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_integer_decompress_radix_ciphertext(
CudaStreams streams, int_decompression<Torus> **mem_ptr,
CudaStreams streams, int_decompression<Torus, KSTorus> **mem_ptr,
uint32_t num_blocks_to_decompress, int_radix_params encryption_params,
int_radix_params compression_params, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_decompression<Torus>(
*mem_ptr = new int_decompression<Torus, KSTorus>(
streams, encryption_params, compression_params, num_blocks_to_decompress,
allocate_gpu_memory, size_tracker);
return size_tracker;
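
host_integer_decompress gates its instantiations with a static_assert, the usual way to keep a template from silently compiling for unsupported torus widths. A minimal sketch of that compile-time gate:

#include <cstdint>
#include <type_traits>

template <typename Torus> void decompress_like() {
  static_assert(std::is_same_v<Torus, uint64_t> ||
                    std::is_same_v<Torus, __uint128_t>,
                "this path only supports 64-bit and 128-bit torus types");
  // body runs only for the accepted instantiations
}

int main() {
  decompress_like<uint64_t>();    // compiles
  decompress_like<__uint128_t>(); // compiles
  // decompress_like<uint32_t>(); // would fail at compile time
}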

View File

@@ -15,8 +15,9 @@ uint64_t scratch_cuda_integer_div_rem_radix_ciphertext_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_div_rem<uint64_t>(
CudaStreams(streams), is_signed, (int_div_rem_memory<uint64_t> **)mem_ptr,
num_blocks, params, allocate_gpu_memory);
CudaStreams(streams), is_signed,
(int_div_rem_memory<uint64_t, uint64_t> **)mem_ptr, num_blocks, params,
allocate_gpu_memory);
POP_RANGE()
}
@@ -26,19 +27,19 @@ void cuda_integer_div_rem_radix_ciphertext_64(
CudaRadixCiphertextFFI const *divisor, bool is_signed, int8_t *mem_ptr,
void *const *bsks, void *const *ksks) {
PUSH_RANGE("div")
auto mem = (int_div_rem_memory<uint64_t> *)mem_ptr;
auto mem = (int_div_rem_memory<uint64_t, uint64_t> *)mem_ptr;
host_integer_div_rem<uint64_t>(CudaStreams(streams), quotient, remainder,
numerator, divisor, is_signed, bsks,
(uint64_t **)(ksks), mem);
host_integer_div_rem<uint64_t, uint64_t>(
CudaStreams(streams), quotient, remainder, numerator, divisor, is_signed,
bsks, (uint64_t **)(ksks), mem);
POP_RANGE()
}
void cleanup_cuda_integer_div_rem(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup div")
int_div_rem_memory<uint64_t> *mem_ptr =
(int_div_rem_memory<uint64_t> *)(*mem_ptr_void);
int_div_rem_memory<uint64_t, uint64_t> *mem_ptr =
(int_div_rem_memory<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;

View File

@@ -13,24 +13,26 @@
#include "integer/subtraction.cuh"
#include <fstream>
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_integer_div_rem(
CudaStreams streams, bool is_signed, int_div_rem_memory<Torus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory) {
CudaStreams streams, bool is_signed,
int_div_rem_memory<Torus, KSTorus> **mem_ptr, uint32_t num_blocks,
int_radix_params params, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr =
new int_div_rem_memory<Torus>(streams, params, is_signed, num_blocks,
allocate_gpu_memory, size_tracker);
*mem_ptr = new int_div_rem_memory<Torus, KSTorus>(
streams, params, is_signed, num_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_unsigned_integer_div_rem_block_by_block_2_2(
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
uint64_t *const *ksks, unsigned_int_div_rem_2_2_memory<uint64_t> *mem_ptr) {
uint64_t *const *ksks,
unsigned_int_div_rem_2_2_memory<uint64_t, uint64_t> *mem_ptr) {
if (streams.count() < 4) {
PANIC("GPU count should be greater than 4 when using div_rem_2_2");
@@ -144,31 +146,32 @@ __host__ void host_unsigned_integer_div_rem_block_by_block_2_2(
init_low_rem_f(mem_ptr->low3, mem_ptr->d3, mem_ptr->rem3, remainder_gpu_0,
0, true);
auto sub_result_f = [&](CudaStreams streams, size_t gpu_index,
CudaRadixCiphertextFFI *sub_result,
CudaRadixCiphertextFFI *sub_overflowed,
int_borrow_prop_memory<Torus> *overflow_sub_mem,
CudaRadixCiphertextFFI *low,
CudaRadixCiphertextFFI *rem, Torus *first_indexes,
Torus *second_indexes, Torus *scalar_indexes) {
uint32_t compute_overflow = 1;
uint32_t uses_input_borrow = 0;
sub_result->num_radix_blocks = low->num_radix_blocks;
overflow_sub_mem->update_lut_indexes(
streams.get_ith(gpu_index), first_indexes, second_indexes,
scalar_indexes, rem->num_radix_blocks);
host_integer_overflowing_sub<uint64_t>(
streams.get_ith(gpu_index), sub_result, rem, low, sub_overflowed,
(const CudaRadixCiphertextFFI *)nullptr, overflow_sub_mem,
&bsks[gpu_index], &ksks[gpu_index], compute_overflow,
uses_input_borrow);
};
auto sub_result_f =
[&](CudaStreams streams, size_t gpu_index,
CudaRadixCiphertextFFI *sub_result,
CudaRadixCiphertextFFI *sub_overflowed,
int_borrow_prop_memory<Torus, KSTorus> *overflow_sub_mem,
CudaRadixCiphertextFFI *low, CudaRadixCiphertextFFI *rem,
Torus *first_indexes, Torus *second_indexes,
Torus *scalar_indexes) {
uint32_t compute_overflow = 1;
uint32_t uses_input_borrow = 0;
sub_result->num_radix_blocks = low->num_radix_blocks;
overflow_sub_mem->update_lut_indexes(
streams.get_ith(gpu_index), first_indexes, second_indexes,
scalar_indexes, rem->num_radix_blocks);
host_integer_overflowing_sub<uint64_t>(
streams.get_ith(gpu_index), sub_result, rem, low, sub_overflowed,
(const CudaRadixCiphertextFFI *)nullptr, overflow_sub_mem,
&bsks[gpu_index], &ksks[gpu_index], compute_overflow,
uses_input_borrow);
};
auto cmp_f = [&](CudaStreams streams, size_t gpu_index,
CudaRadixCiphertextFFI *out_boolean_block,
CudaRadixCiphertextFFI *comparison_blocks,
CudaRadixCiphertextFFI *d,
int_comparison_buffer<Torus> *comparison_buffer) {
int_comparison_buffer<Torus, KSTorus> *comparison_buffer) {
CudaRadixCiphertextFFI d_msb;
uint32_t slice_start = num_blocks - block_index;
uint32_t slice_end = d->num_radix_blocks;
@@ -328,7 +331,8 @@ __host__ void host_unsigned_integer_div_rem_block_by_block_2_2(
auto conditional_update = [&](CudaStreams streams, size_t gpu_index,
CudaRadixCiphertextFFI *cx,
CudaRadixCiphertextFFI *rx,
int_radix_lut<Torus> *lut, Torus factor) {
int_radix_lut<Torus, KSTorus> *lut,
Torus factor) {
auto rx_list = to_lwe_ciphertext_list(rx);
host_cleartext_multiplication<Torus>(streams.stream(gpu_index),
streams.gpu_index(gpu_index),
@@ -467,12 +471,13 @@ __host__ void host_unsigned_integer_div_rem_block_by_block_2_2(
}
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_unsigned_integer_div_rem(
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, void *const *bsks,
uint64_t *const *ksks, unsigned_int_div_rem_memory<uint64_t> *mem_ptr) {
uint64_t *const *ksks,
unsigned_int_div_rem_memory<uint64_t, uint64_t> *mem_ptr) {
if (remainder->num_radix_blocks != numerator->num_radix_blocks ||
remainder->num_radix_blocks != divisor->num_radix_blocks ||
@@ -485,7 +490,7 @@ __host__ void host_unsigned_integer_div_rem(
if (mem_ptr->params.message_modulus == 4 &&
mem_ptr->params.carry_modulus == 4 && streams.count() >= 4) {
host_unsigned_integer_div_rem_block_by_block_2_2<Torus>(
host_unsigned_integer_div_rem_block_by_block_2_2<Torus, KSTorus>(
streams, quotient, remainder, numerator, divisor, bsks, ksks,
mem_ptr->div_rem_2_2_mem);
return;
@@ -897,12 +902,14 @@ __host__ void host_unsigned_integer_div_rem(
mem_ptr->sub_streams_2.synchronize();
}
template <typename Torus>
__host__ void host_integer_div_rem(
CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder, CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed, void *const *bsks,
uint64_t *const *ksks, int_div_rem_memory<uint64_t> *int_mem_ptr) {
template <typename Torus, typename KSTorus>
__host__ void
host_integer_div_rem(CudaStreams streams, CudaRadixCiphertextFFI *quotient,
CudaRadixCiphertextFFI *remainder,
CudaRadixCiphertextFFI const *numerator,
CudaRadixCiphertextFFI const *divisor, bool is_signed,
void *const *bsks, uint64_t *const *ksks,
int_div_rem_memory<uint64_t, uint64_t> *int_mem_ptr) {
if (remainder->num_radix_blocks != numerator->num_radix_blocks ||
remainder->num_radix_blocks != divisor->num_radix_blocks ||
remainder->num_radix_blocks != quotient->num_radix_blocks)
@@ -934,7 +941,7 @@ __host__ void host_integer_div_rem(
int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();
host_unsigned_integer_div_rem<Torus>(
host_unsigned_integer_div_rem<Torus, KSTorus>(
int_mem_ptr->sub_streams_1, quotient, remainder, positive_numerator,
positive_divisor, bsks, ksks, int_mem_ptr->unsigned_mem);
@@ -944,7 +951,7 @@ __host__ void host_integer_div_rem(
CudaRadixCiphertextFFI divisor_sign;
as_radix_ciphertext_slice<Torus>(&divisor_sign, divisor, num_blocks - 1,
num_blocks);
integer_radix_apply_bivariate_lookup_table<Torus>(
integer_radix_apply_bivariate_lookup_table<Torus, KSTorus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->sign_bits_are_different,
&numerator_sign, &divisor_sign, bsks, ksks,
int_mem_ptr->compare_signed_bits_lut, 1,
@@ -959,35 +966,36 @@ __host__ void host_integer_div_rem(
uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(int_mem_ptr->sub_streams_1,
int_mem_ptr->negated_quotient, nullptr,
nullptr, int_mem_ptr->scp_mem_1, bsks,
ksks, requested_flag, uses_carry);
host_propagate_single_carry<Torus, KSTorus>(
int_mem_ptr->sub_streams_1, int_mem_ptr->negated_quotient, nullptr,
nullptr, int_mem_ptr->scp_mem_1, bsks, ksks, requested_flag,
uses_carry);
host_negation<Torus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, remainder,
radix_params.message_modulus, radix_params.carry_modulus, num_blocks);
host_propagate_single_carry<Torus>(int_mem_ptr->sub_streams_2,
int_mem_ptr->negated_remainder, nullptr,
nullptr, int_mem_ptr->scp_mem_2, bsks,
ksks, requested_flag, uses_carry);
host_propagate_single_carry<Torus, KSTorus>(
int_mem_ptr->sub_streams_2, int_mem_ptr->negated_remainder, nullptr,
nullptr, int_mem_ptr->scp_mem_2, bsks, ksks, requested_flag,
uses_carry);
host_cmux<Torus>(int_mem_ptr->sub_streams_1, quotient,
int_mem_ptr->sign_bits_are_different,
int_mem_ptr->negated_quotient, quotient,
int_mem_ptr->cmux_quotient_mem, bsks, ksks);
host_cmux<Torus, KSTorus>(int_mem_ptr->sub_streams_1, quotient,
int_mem_ptr->sign_bits_are_different,
int_mem_ptr->negated_quotient, quotient,
int_mem_ptr->cmux_quotient_mem, bsks, ksks);
host_cmux<Torus>(int_mem_ptr->sub_streams_2, remainder, &numerator_sign,
int_mem_ptr->negated_remainder, remainder,
int_mem_ptr->cmux_remainder_mem, bsks, ksks);
host_cmux<Torus, KSTorus>(int_mem_ptr->sub_streams_2, remainder,
&numerator_sign, int_mem_ptr->negated_remainder,
remainder, int_mem_ptr->cmux_remainder_mem, bsks,
ksks);
int_mem_ptr->sub_streams_1.synchronize();
int_mem_ptr->sub_streams_2.synchronize();
} else {
host_unsigned_integer_div_rem<Torus>(streams, quotient, remainder,
numerator, divisor, bsks, ksks,
int_mem_ptr->unsigned_mem);
host_unsigned_integer_div_rem<Torus, KSTorus>(
streams, quotient, remainder, numerator, divisor, bsks, ksks,
int_mem_ptr->unsigned_mem);
}
}
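
The signed path wraps the unsigned core with a sign fix-up: divide the absolute values, negate the quotient when compare_signed_bits_lut finds the operand signs differ, and give the remainder the numerator's sign, both selections done by cmux. A plaintext model (assuming the truncated-division semantics this cmux selection implements):

#include <cassert>
#include <cstdint>

// Plaintext model of the signed division fix-up around the unsigned core:
// the quotient is negated when operand signs differ, the remainder takes
// the numerator's sign (truncated division).
void signed_div_rem_model(int64_t n, int64_t d, int64_t &q, int64_t &r) {
  uint64_t un = (uint64_t)(n < 0 ? -n : n);
  uint64_t ud = (uint64_t)(d < 0 ? -d : d);
  uint64_t uq = un / ud; // host_unsigned_integer_div_rem does this part
  uint64_t ur = un % ud;
  bool signs_differ = (n < 0) != (d < 0); // compare_signed_bits_lut
  q = signs_differ ? -(int64_t)uq : (int64_t)uq; // cmux on the quotient
  r = (n < 0) ? -(int64_t)ur : (int64_t)ur;      // cmux on the remainder
}

int main() {
  int64_t q, r;
  signed_div_rem_model(-7, 2, q, r);
  assert(q == -3 && r == -1); // -7 = -3 * 2 + (-1)
}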

View File

@@ -15,10 +15,11 @@ uint64_t scratch_integer_count_of_consecutive_bits_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_integer_count_of_consecutive_bits<uint64_t>(
return scratch_integer_count_of_consecutive_bits<uint64_t, uint64_t>(
CudaStreams(streams), params,
(int_count_of_consecutive_bits_buffer<uint64_t> **)mem_ptr, num_blocks,
counter_num_blocks, direction, bit_value, allocate_gpu_memory);
(int_count_of_consecutive_bits_buffer<uint64_t, uint64_t> **)mem_ptr,
num_blocks, counter_num_blocks, direction, bit_value,
allocate_gpu_memory);
}
// Computes the number of consecutive bits in an encrypted integer.
@@ -33,15 +34,16 @@ void cuda_integer_count_of_consecutive_bits_64(
host_integer_count_of_consecutive_bits<uint64_t, uint64_t>(
CudaStreams(streams), output_ct, input_ct,
(int_count_of_consecutive_bits_buffer<uint64_t> *)mem_ptr, bsks,
(int_count_of_consecutive_bits_buffer<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks);
}
void cleanup_cuda_integer_count_of_consecutive_bits_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_count_of_consecutive_bits_buffer<uint64_t> *mem_ptr =
(int_count_of_consecutive_bits_buffer<uint64_t> *)(*mem_ptr_void);
int_count_of_consecutive_bits_buffer<uint64_t, uint64_t> *mem_ptr =
(int_count_of_consecutive_bits_buffer<uint64_t, uint64_t>
*)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -64,10 +66,10 @@ uint64_t scratch_integer_ilog2_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_integer_ilog2<uint64_t>(
CudaStreams(streams), params, (int_ilog2_buffer<uint64_t> **)mem_ptr,
input_num_blocks, counter_num_blocks, num_bits_in_ciphertext,
allocate_gpu_memory);
return scratch_integer_ilog2<uint64_t, uint64_t>(
CudaStreams(streams), params,
(int_ilog2_buffer<uint64_t, uint64_t> **)mem_ptr, input_num_blocks,
counter_num_blocks, num_bits_in_ciphertext, allocate_gpu_memory);
}
// Computes the integer logarithm base 2 of an encrypted integer.
@@ -84,15 +86,15 @@ void cuda_integer_ilog2_64(
host_integer_ilog2<uint64_t, uint64_t>(
CudaStreams(streams), output_ct, input_ct, trivial_ct_neg_n, trivial_ct_2,
trivial_ct_m_minus_1_block, (int_ilog2_buffer<uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks);
trivial_ct_m_minus_1_block,
(int_ilog2_buffer<uint64_t, uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks);
}
void cleanup_cuda_integer_ilog2_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_ilog2_buffer<uint64_t> *mem_ptr =
(int_ilog2_buffer<uint64_t> *)(*mem_ptr_void);
int_ilog2_buffer<uint64_t, uint64_t> *mem_ptr =
(int_ilog2_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
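
ilog2 builds on the consecutive-bit counter: counting leading zeros and subtracting from the bit width yields floor(log2(x)). The trivial-ciphertext operands passed above are implementation details; the sketch below only shows the arithmetic identity this reduction relies on (an assumption on my part):

#include <cassert>
#include <cstdint>

// floor(log2(x)) from a count of leading zeros: with w bits total,
// ilog2(x) = (w - 1) - clz(x) for x > 0.
uint32_t ilog2_model(uint64_t x, uint32_t width) {
  assert(x > 0);
  uint32_t clz = 0;
  for (uint32_t i = width; i-- > 0;) {
    if ((x >> i) & 1)
      break;
    ++clz; // consecutive zero bits from the MSB down
  }
  return (width - 1) - clz;
}

int main() {
  assert(ilog2_model(1, 8) == 0);
  assert(ilog2_model(9, 8) == 3);
  assert(ilog2_model(128, 8) == 7);
}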

View File

@@ -9,7 +9,7 @@
template <typename Torus, typename KSTorus>
__host__ void host_integer_prepare_count_of_consecutive_bits(
CudaStreams streams, CudaRadixCiphertextFFI *ciphertext,
int_prepare_count_of_consecutive_bits_buffer<Torus> *mem_ptr,
int_prepare_count_of_consecutive_bits_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, KSTorus *const *ksks) {
auto tmp = mem_ptr->tmp_ct;
@@ -26,16 +26,16 @@ __host__ void host_integer_prepare_count_of_consecutive_bits(
ciphertext->num_radix_blocks);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_integer_count_of_consecutive_bits(
CudaStreams streams, const int_radix_params params,
int_count_of_consecutive_bits_buffer<Torus> **mem_ptr,
int_count_of_consecutive_bits_buffer<Torus, KSTorus> **mem_ptr,
uint32_t num_radix_blocks, uint32_t counter_num_blocks, Direction direction,
BitValue bit_value, const bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_count_of_consecutive_bits_buffer<Torus>(
*mem_ptr = new int_count_of_consecutive_bits_buffer<Torus, KSTorus>(
streams, params, num_radix_blocks, counter_num_blocks, direction,
bit_value, allocate_gpu_memory, size_tracker);
@@ -46,8 +46,8 @@ template <typename Torus, typename KSTorus>
__host__ void host_integer_count_of_consecutive_bits(
CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *input_ct,
int_count_of_consecutive_bits_buffer<Torus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks) {
int_count_of_consecutive_bits_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, KSTorus *const *ksks) {
auto params = mem_ptr->params;
auto ct_prepared = mem_ptr->ct_prepared;
@@ -80,18 +80,16 @@ __host__ void host_integer_count_of_consecutive_bits(
mem_ptr->propagate_mem, bsks, ksks, 0, 0);
}
template <typename Torus>
__host__ uint64_t scratch_integer_ilog2(CudaStreams streams,
const int_radix_params params,
int_ilog2_buffer<Torus> **mem_ptr,
uint32_t input_num_blocks,
uint32_t counter_num_blocks,
uint32_t num_bits_in_ciphertext,
const bool allocate_gpu_memory) {
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_integer_ilog2(
CudaStreams streams, const int_radix_params params,
int_ilog2_buffer<Torus, KSTorus> **mem_ptr, uint32_t input_num_blocks,
uint32_t counter_num_blocks, uint32_t num_bits_in_ciphertext,
const bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_ilog2_buffer<Torus>(
*mem_ptr = new int_ilog2_buffer<Torus, KSTorus>(
streams, params, input_num_blocks, counter_num_blocks,
num_bits_in_ciphertext, allocate_gpu_memory, size_tracker);
@@ -105,7 +103,7 @@ host_integer_ilog2(CudaStreams streams, CudaRadixCiphertextFFI *output_ct,
CudaRadixCiphertextFFI const *trivial_ct_neg_n,
CudaRadixCiphertextFFI const *trivial_ct_2,
CudaRadixCiphertextFFI const *trivial_ct_m_minus_1_block,
int_ilog2_buffer<Torus> *mem_ptr, void *const *bsks,
int_ilog2_buffer<Torus, KSTorus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks) {
// Prepare the input ciphertext by computing the number of consecutive

View File

@@ -6,12 +6,12 @@ void cuda_full_propagation_64_inplace(CudaStreamsFFI streams,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks) {
int_fullprop_buffer<uint64_t> *buffer =
(int_fullprop_buffer<uint64_t> *)mem_ptr;
int_fullprop_buffer<uint64_t, uint64_t> *buffer =
(int_fullprop_buffer<uint64_t, uint64_t> *)mem_ptr;
host_full_propagate_inplace<uint64_t>(CudaStreams(streams), input_blocks,
buffer, (uint64_t **)(ksks), bsks,
num_blocks);
host_full_propagate_inplace<uint64_t, uint64_t>(
CudaStreams(streams), input_blocks, buffer, (uint64_t **)(ksks), bsks,
num_blocks);
}
uint64_t scratch_cuda_full_propagation_64(
@@ -27,16 +27,16 @@ uint64_t scratch_cuda_full_propagation_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_full_propagation<uint64_t>(
CudaStreams(streams), (int_fullprop_buffer<uint64_t> **)mem_ptr, params,
allocate_gpu_memory);
return scratch_cuda_full_propagation<uint64_t, uint64_t>(
CudaStreams(streams), (int_fullprop_buffer<uint64_t, uint64_t> **)mem_ptr,
params, allocate_gpu_memory);
}
void cleanup_cuda_full_propagation(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_fullprop_buffer<uint64_t> *mem_ptr =
(int_fullprop_buffer<uint64_t> *)(*mem_ptr_void);
int_fullprop_buffer<uint64_t, uint64_t> *mem_ptr =
(int_fullprop_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
@@ -56,8 +56,8 @@ uint64_t scratch_cuda_propagate_single_carry_64_inplace(
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_propagate_single_carry_inplace<uint64_t>(
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
return scratch_cuda_propagate_single_carry_inplace<uint64_t, uint64_t>(
CudaStreams(streams), (int_sc_prop_memory<uint64_t, uint64_t> **)mem_ptr,
num_blocks, params, requested_flag, allocate_gpu_memory);
}
@@ -75,7 +75,7 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_64_inplace(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_propagate_single_carry_inplace<uint64_t>(
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
CudaStreams(streams), (int_sc_prop_memory<uint64_t, uint64_t> **)mem_ptr,
num_blocks, params, requested_flag, allocate_gpu_memory);
}
@@ -93,8 +93,9 @@ uint64_t scratch_cuda_integer_overflowing_sub_64_inplace(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_integer_overflowing_sub<uint64_t>(
CudaStreams(streams), (int_borrow_prop_memory<uint64_t> **)mem_ptr,
num_blocks, params, compute_overflow, allocate_gpu_memory);
CudaStreams(streams),
(int_borrow_prop_memory<uint64_t, uint64_t> **)mem_ptr, num_blocks,
params, compute_overflow, allocate_gpu_memory);
}
void cuda_propagate_single_carry_64_inplace(
@@ -105,8 +106,8 @@ void cuda_propagate_single_carry_64_inplace(
host_propagate_single_carry<uint64_t>(
CudaStreams(streams), lwe_array, carry_out, carry_in,
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
requested_flag, uses_carry);
(int_sc_prop_memory<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), requested_flag, uses_carry);
}
void cuda_add_and_propagate_single_carry_64_inplace(
@@ -117,8 +118,8 @@ void cuda_add_and_propagate_single_carry_64_inplace(
host_add_and_propagate_single_carry<uint64_t>(
CudaStreams(streams), lhs_array, rhs_array, carry_out, carry_in,
(int_sc_prop_memory<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
requested_flag, uses_carry);
(int_sc_prop_memory<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), requested_flag, uses_carry);
}
void cuda_integer_overflowing_sub_64_inplace(
@@ -131,7 +132,7 @@ void cuda_integer_overflowing_sub_64_inplace(
PUSH_RANGE("overflow sub")
host_integer_overflowing_sub<uint64_t>(
CudaStreams(streams), lhs_array, lhs_array, rhs_array, overflow_block,
input_borrow, (int_borrow_prop_memory<uint64_t> *)mem_ptr, bsks,
input_borrow, (int_borrow_prop_memory<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, compute_overflow, uses_input_borrow);
POP_RANGE()
}
@@ -139,8 +140,8 @@ void cuda_integer_overflowing_sub_64_inplace(
void cleanup_cuda_propagate_single_carry(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup propagate sc")
int_sc_prop_memory<uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
int_sc_prop_memory<uint64_t, uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
@@ -150,8 +151,8 @@ void cleanup_cuda_propagate_single_carry(CudaStreamsFFI streams,
void cleanup_cuda_add_and_propagate_single_carry(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup add & propagate sc")
int_sc_prop_memory<uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t> *)(*mem_ptr_void);
int_sc_prop_memory<uint64_t, uint64_t> *mem_ptr =
(int_sc_prop_memory<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
@@ -160,8 +161,8 @@ void cleanup_cuda_add_and_propagate_single_carry(CudaStreamsFFI streams,
void cleanup_cuda_integer_overflowing_sub(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup overflow sub")
int_borrow_prop_memory<uint64_t> *mem_ptr =
(int_borrow_prop_memory<uint64_t> *)(*mem_ptr_void);
int_borrow_prop_memory<uint64_t, uint64_t> *mem_ptr =
(int_borrow_prop_memory<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
@@ -182,8 +183,8 @@ uint64_t scratch_cuda_apply_univariate_lut_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_apply_univariate_lut<uint64_t>(
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
return scratch_cuda_apply_univariate_lut<uint64_t, uint64_t>(
CudaStreams(streams), (int_radix_lut<uint64_t, uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
lut_degree, allocate_gpu_memory);
}
@@ -202,8 +203,8 @@ uint64_t scratch_cuda_apply_many_univariate_lut_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_cuda_apply_many_univariate_lut<uint64_t>(
CudaStreams(streams), (int_radix_lut<uint64_t> **)mem_ptr,
return scratch_cuda_apply_many_univariate_lut<uint64_t, uint64_t>(
CudaStreams(streams), (int_radix_lut<uint64_t, uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
num_many_lut, lut_degree, allocate_gpu_memory);
}
@@ -214,15 +215,16 @@ void cuda_apply_univariate_lut_64(CudaStreamsFFI streams,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks) {
host_apply_univariate_lut<uint64_t>(
host_apply_univariate_lut<uint64_t, uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks);
(int_radix_lut<uint64_t, uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks);
}
void cleanup_cuda_apply_univariate_lut_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup univar lut")
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
int_radix_lut<uint64_t, uint64_t> *mem_ptr =
(int_radix_lut<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
@@ -235,9 +237,9 @@ void cuda_apply_many_univariate_lut_64(
void *const *ksks, void *const *bsks, uint32_t num_many_lut,
uint32_t lut_stride) {
host_apply_many_univariate_lut<uint64_t>(
host_apply_many_univariate_lut<uint64_t, uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
(int_radix_lut<uint64_t, uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
num_many_lut, lut_stride);
}
@@ -266,12 +268,13 @@ void reverseArray(uint64_t arr[], size_t n) {
uint64_t scratch_cuda_apply_noise_squashing_mem(
CudaStreamsFFI streams, int_radix_params params,
int_noise_squashing_lut<uint64_t> **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t num_radix_blocks,
uint32_t original_num_blocks, bool allocate_gpu_memory) {
int_noise_squashing_lut<uint64_t, uint64_t> **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t num_radix_blocks, uint32_t original_num_blocks,
bool allocate_gpu_memory) {
PUSH_RANGE("scratch noise squashing")
uint64_t size_tracker = 0;
*mem_ptr = new int_noise_squashing_lut<uint64_t>(
*mem_ptr = new int_noise_squashing_lut<uint64_t, uint64_t>(
CudaStreams(streams), params, glwe_dimension, polynomial_size,
num_radix_blocks, original_num_blocks, allocate_gpu_memory, size_tracker);
POP_RANGE()
@@ -294,7 +297,7 @@ uint64_t scratch_cuda_apply_noise_squashing(
noise_reduction_type);
return scratch_cuda_apply_noise_squashing_mem(
streams, params, (int_noise_squashing_lut<uint64_t> **)mem_ptr,
streams, params, (int_noise_squashing_lut<uint64_t, uint64_t> **)mem_ptr,
input_glwe_dimension, input_polynomial_size, num_radix_blocks,
original_num_blocks, allocate_gpu_memory);
}
@@ -306,17 +309,18 @@ void cuda_apply_noise_squashing(CudaStreamsFFI streams,
void *const *bsks) {
PUSH_RANGE("apply noise squashing")
integer_radix_apply_noise_squashing<uint64_t>(
integer_radix_apply_noise_squashing<uint64_t, uint64_t>(
CudaStreams(streams), output_radix_lwe, input_radix_lwe,
(int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks);
(int_noise_squashing_lut<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks);
POP_RANGE()
}
void cleanup_cuda_apply_noise_squashing(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup noise squashing")
int_noise_squashing_lut<uint64_t> *mem_ptr =
(int_noise_squashing_lut<uint64_t> *)(*mem_ptr_void);
int_noise_squashing_lut<uint64_t, uint64_t> *mem_ptr =
(int_noise_squashing_lut<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
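
All the carry-propagation entry points normalize radix blocks whose carries have accumulated: each block keeps its value modulo message_modulus and forwards the quotient to the next block. A sequential plaintext model of the inplace propagation (the real kernels resolve carries in parallel groups):

#include <cassert>
#include <cstdint>
#include <vector>

// Inplace single-carry propagation over radix blocks: each block keeps
// value % message_modulus and forwards value / message_modulus upward.
uint64_t propagate_single_carry_model(std::vector<uint64_t> &blocks,
                                      uint64_t message_modulus) {
  uint64_t carry = 0;
  for (auto &block : blocks) {
    uint64_t v = block + carry;
    block = v % message_modulus; // message part stays in the block
    carry = v / message_modulus; // carry moves to the next block
  }
  return carry; // the real API can expose this as carry_out
}

int main() {
  std::vector<uint64_t> blocks{5, 3, 0}; // message_modulus = 4, lsb first
  uint64_t carry_out = propagate_single_carry_model(blocks, 4);
  assert(blocks[0] == 1 && blocks[1] == 0 && blocks[2] == 1); // still 17
  assert(carry_out == 0);
}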

View File

@@ -506,7 +506,7 @@ template <typename Torus, typename KSTorus>
__host__ void integer_radix_apply_univariate_lookup_table(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
KSTorus *const *ksks, int_radix_lut<Torus> *lut,
KSTorus *const *ksks, int_radix_lut<Torus, KSTorus> *lut,
uint32_t num_radix_blocks) {
PUSH_RANGE("apply lut")
// apply_lookup_table
@@ -538,7 +538,7 @@ __host__ void integer_radix_apply_univariate_lookup_table(
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<KSTorus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
@@ -612,8 +612,8 @@ template <typename Torus, typename KSTorus>
__host__ void integer_radix_apply_many_univariate_lookup_table(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
KSTorus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_many_lut,
uint32_t lut_stride) {
KSTorus *const *ksks, int_radix_lut<Torus, KSTorus> *lut,
uint32_t num_many_lut, uint32_t lut_stride) {
PUSH_RANGE("apply many lut")
// apply_lookup_table
auto params = lut->params;
@@ -717,8 +717,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_1,
CudaRadixCiphertextFFI const *lwe_array_2, void *const *bsks,
KSTorus *const *ksks, int_radix_lut<Torus> *lut, uint32_t num_radix_blocks,
uint32_t shift) {
KSTorus *const *ksks, int_radix_lut<Torus, KSTorus> *lut,
uint32_t num_radix_blocks, uint32_t shift) {
PUSH_RANGE("apply bivar lut")
if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
@@ -1275,8 +1275,9 @@ void generate_many_lut_device_accumulator(
template <typename Torus, typename KSTorus>
void host_compute_shifted_blocks_and_states(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
int_shifted_blocks_and_states_memory<Torus> *mem, void *const *bsks,
KSTorus *const *ksks, uint32_t lut_stride, uint32_t num_many_lut) {
int_shifted_blocks_and_states_memory<Torus, KSTorus> *mem,
void *const *bsks, KSTorus *const *ksks, uint32_t lut_stride,
uint32_t num_many_lut) {
auto num_radix_blocks = lwe_array->num_radix_blocks;
@@ -1303,7 +1304,7 @@ template <typename Torus, typename KSTorus>
void host_resolve_group_carries_sequentially(
CudaStreams streams, CudaRadixCiphertextFFI *resolved_carries,
CudaRadixCiphertextFFI *grouping_pgns, int_radix_params params,
int_seq_group_prop_memory<Torus> *mem, void *const *bsks,
int_seq_group_prop_memory<Torus, KSTorus> *mem, void *const *bsks,
KSTorus *const *ksks, uint32_t num_groups) {
auto group_resolved_carries = mem->group_resolved_carries;
@@ -1370,8 +1371,9 @@ void host_resolve_group_carries_sequentially(
template <typename Torus, typename KSTorus>
void host_compute_prefix_sum_hillis_steele(
CudaStreams streams, CudaRadixCiphertextFFI *step_output,
CudaRadixCiphertextFFI *generates_or_propagates, int_radix_lut<Torus> *luts,
void *const *bsks, KSTorus *const *ksks, uint32_t num_radix_blocks) {
CudaRadixCiphertextFFI *generates_or_propagates,
int_radix_lut<Torus, KSTorus> *luts, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks) {
if (step_output->lwe_dimension != generates_or_propagates->lwe_dimension)
PANIC("Cuda error: input lwe dimensions must be the same")
@@ -1413,9 +1415,9 @@ void host_compute_prefix_sum_hillis_steele(
template <typename Torus, typename KSTorus>
void host_compute_propagation_simulators_and_group_carries(
CudaStreams streams, CudaRadixCiphertextFFI *block_states,
int_radix_params params, int_prop_simu_group_carries_memory<Torus> *mem,
void *const *bsks, KSTorus *const *ksks, uint32_t num_radix_blocks,
uint32_t num_groups) {
int_radix_params params,
int_prop_simu_group_carries_memory<Torus, KSTorus> *mem, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks, uint32_t num_groups) {
if (num_radix_blocks > block_states->num_radix_blocks)
PANIC("Cuda error: input does not have enough radix blocks")
@@ -1475,8 +1477,9 @@ void host_compute_propagation_simulators_and_group_carries(
template <typename Torus, typename KSTorus>
void host_compute_shifted_blocks_and_borrow_states(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
int_shifted_blocks_and_borrow_states_memory<Torus> *mem, void *const *bsks,
KSTorus *const *ksks, uint32_t lut_stride, uint32_t num_many_lut) {
int_shifted_blocks_and_borrow_states_memory<Torus, KSTorus> *mem,
void *const *bsks, KSTorus *const *ksks, uint32_t lut_stride,
uint32_t num_many_lut) {
auto num_radix_blocks = lwe_array->num_radix_blocks;
auto shifted_blocks_and_borrow_states = mem->shifted_blocks_and_borrow_states;
@@ -1508,7 +1511,7 @@ void host_compute_shifted_blocks_and_borrow_states(
template <typename Torus, typename KSTorus>
void host_full_propagate_inplace(CudaStreams streams,
CudaRadixCiphertextFFI *input_blocks,
int_fullprop_buffer<Torus> *mem_ptr,
int_fullprop_buffer<Torus, KSTorus> *mem_ptr,
KSTorus *const *ksks, void *const *bsks,
uint32_t num_blocks) {
auto params = mem_ptr->lut->params;
@@ -1566,15 +1569,14 @@ void host_full_propagate_inplace(CudaStreams streams,
}
}
template <typename Torus>
uint64_t scratch_cuda_full_propagation(CudaStreams streams,
int_fullprop_buffer<Torus> **mem_ptr,
int_radix_params params,
bool allocate_gpu_memory) {
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_full_propagation(
CudaStreams streams, int_fullprop_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_fullprop_buffer<Torus>(streams, params,
allocate_gpu_memory, size_tracker);
*mem_ptr = new int_fullprop_buffer<Torus, KSTorus>(
streams, params, allocate_gpu_memory, size_tracker);
return size_tracker;
}
@@ -1674,7 +1676,7 @@ extract_n_bits(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
const CudaRadixCiphertextFFI *lwe_array_in, void *const *bsks,
KSTorus *const *ksks, uint32_t effective_num_radix_blocks,
uint32_t num_radix_blocks,
int_bit_extract_luts_buffer<Torus> *bit_extract) {
int_bit_extract_luts_buffer<Torus, KSTorus> *bit_extract) {
copy_radix_ciphertext_slice_async<Torus>(
streams.stream(0), streams.gpu_index(0), lwe_array_out, 0,
@@ -1696,7 +1698,7 @@ template <typename Torus, typename KSTorus>
__host__ void
reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
CudaRadixCiphertextFFI *signs_array_in,
int_comparison_buffer<Torus> *mem_ptr,
int_comparison_buffer<Torus, KSTorus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
KSTorus *const *ksks, uint32_t num_sign_blocks) {
@@ -1796,15 +1798,15 @@ reduce_signs(CudaStreams streams, CudaRadixCiphertextFFI *signs_array_out,
}
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_apply_univariate_lut(
CudaStreams streams, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
uint32_t num_radix_blocks, int_radix_params params, uint64_t lut_degree,
bool allocate_gpu_memory) {
CudaStreams streams, int_radix_lut<Torus, KSTorus> **mem_ptr,
Torus const *input_lut, uint32_t num_radix_blocks, int_radix_params params,
uint64_t lut_degree, bool allocate_gpu_memory) {
PUSH_RANGE("scratch univar lut")
uint64_t size_tracker = 0;
*mem_ptr = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
*mem_ptr = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_radix_blocks, allocate_gpu_memory, size_tracker);
// It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
// 0
cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1822,24 +1824,24 @@ template <typename Torus, typename KSTorus>
void host_apply_univariate_lut(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in,
int_radix_lut<Torus> *mem, KSTorus *const *ksks,
void *const *bsks) {
int_radix_lut<Torus, KSTorus> *mem,
KSTorus *const *ksks, void *const *bsks) {
integer_radix_apply_univariate_lookup_table<Torus>(
streams, radix_lwe_out, radix_lwe_in, bsks, ksks, mem,
radix_lwe_out->num_radix_blocks);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_apply_many_univariate_lut(
CudaStreams streams, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
uint32_t num_radix_blocks, int_radix_params params, uint32_t num_many_lut,
uint64_t lut_degree, bool allocate_gpu_memory) {
CudaStreams streams, int_radix_lut<Torus, KSTorus> **mem_ptr,
Torus const *input_lut, uint32_t num_radix_blocks, int_radix_params params,
uint32_t num_many_lut, uint64_t lut_degree, bool allocate_gpu_memory) {
PUSH_RANGE("scratch many lut")
uint64_t size_tracker = 0;
*mem_ptr =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
num_many_lut, allocate_gpu_memory, size_tracker);
*mem_ptr = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_radix_blocks, num_many_lut, allocate_gpu_memory,
size_tracker);
// It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
// 0
cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1857,7 +1859,7 @@ template <typename Torus, typename KSTorus>
void host_apply_many_univariate_lut(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in,
int_radix_lut<Torus> *mem,
int_radix_lut<Torus, KSTorus> *mem,
KSTorus *const *ksks, void *const *bsks,
uint32_t num_many_lut,
uint32_t lut_stride) {
@@ -1867,15 +1869,15 @@ void host_apply_many_univariate_lut(CudaStreams streams,
lut_stride);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_apply_bivariate_lut(
CudaStreams streams, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
uint32_t num_radix_blocks, int_radix_params params, uint64_t lut_degree,
bool allocate_gpu_memory) {
CudaStreams streams, int_radix_lut<Torus, KSTorus> **mem_ptr,
Torus const *input_lut, uint32_t num_radix_blocks, int_radix_params params,
uint64_t lut_degree, bool allocate_gpu_memory) {
PUSH_RANGE("scratch bivar lut")
uint64_t size_tracker = 0;
*mem_ptr = new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
*mem_ptr = new int_radix_lut<Torus, KSTorus>(
streams, params, 1, num_radix_blocks, allocate_gpu_memory, size_tracker);
// It is safe to do this copy on GPU 0, because all LUTs always reside on GPU
// 0
cuda_memcpy_with_size_tracking_async_to_gpu(
@@ -1894,25 +1896,25 @@ void host_apply_bivariate_lut(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_in_1,
CudaRadixCiphertextFFI const *radix_lwe_in_2,
int_radix_lut<Torus> *mem, KSTorus *const *ksks,
void *const *bsks, uint32_t num_radix_blocks,
uint32_t shift) {
int_radix_lut<Torus, KSTorus> *mem,
KSTorus *const *ksks, void *const *bsks,
uint32_t num_radix_blocks, uint32_t shift) {
integer_radix_apply_bivariate_lookup_table<Torus>(
streams, radix_lwe_out, radix_lwe_in_1, radix_lwe_in_2, bsks, ksks, mem,
num_radix_blocks, shift);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_propagate_single_carry_inplace(
CudaStreams streams, int_sc_prop_memory<Torus> **mem_ptr,
CudaStreams streams, int_sc_prop_memory<Torus, KSTorus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag,
bool allocate_gpu_memory) {
PUSH_RANGE("scratch add & propagate sc")
uint64_t size_tracker = 0;
*mem_ptr = new int_sc_prop_memory<Torus>(streams, params, num_radix_blocks,
requested_flag, allocate_gpu_memory,
size_tracker);
*mem_ptr = new int_sc_prop_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, requested_flag, allocate_gpu_memory,
size_tracker);
POP_RANGE()
return size_tracker;
}
@@ -1923,7 +1925,7 @@ void host_propagate_single_carry(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *input_carries,
int_sc_prop_memory<Torus> *mem,
int_sc_prop_memory<Torus, KSTorus> *mem,
void *const *bsks, KSTorus *const *ksks,
uint32_t requested_flag, uint32_t uses_carry) {
PUSH_RANGE("propagate sc")
@@ -2024,9 +2026,9 @@ template <typename Torus, typename KSTorus>
void host_add_and_propagate_single_carry(
CudaStreams streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *input_carries, int_sc_prop_memory<Torus> *mem,
void *const *bsks, KSTorus *const *ksks, uint32_t requested_flag,
uint32_t uses_carry) {
const CudaRadixCiphertextFFI *input_carries,
int_sc_prop_memory<Torus, KSTorus> *mem, void *const *bsks,
KSTorus *const *ksks, uint32_t requested_flag, uint32_t uses_carry) {
PUSH_RANGE("add & propagate sc")
if (lhs_array->num_radix_blocks != rhs_array->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be the same")
@@ -2166,14 +2168,14 @@ void host_add_and_propagate_single_carry(
POP_RANGE()
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_integer_overflowing_sub(
CudaStreams streams, int_borrow_prop_memory<Torus> **mem_ptr,
CudaStreams streams, int_borrow_prop_memory<Torus, KSTorus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
uint32_t compute_overflow, bool allocate_gpu_memory) {
PUSH_RANGE("scratch overflow sub")
uint64_t size_tracker = 0;
*mem_ptr = new int_borrow_prop_memory<Torus>(
*mem_ptr = new int_borrow_prop_memory<Torus, KSTorus>(
streams, params, num_radix_blocks, compute_overflow, allocate_gpu_memory,
size_tracker);
POP_RANGE()
@@ -2187,7 +2189,7 @@ void host_single_borrow_propagate(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI *overflow_block,
const CudaRadixCiphertextFFI *input_borrow,
int_borrow_prop_memory<Torus> *mem,
int_borrow_prop_memory<Torus, KSTorus> *mem,
void *const *bsks, KSTorus *const *ksks,
uint32_t num_groups,
uint32_t compute_overflow,
@@ -2296,12 +2298,11 @@ void host_single_borrow_propagate(CudaStreams streams,
/// LUT. In scalar bitops we use a number of blocks that may be lower than or
/// equal to the input and output numbers of blocks
template <typename InputTorus, typename KSTorus>
__host__ void
integer_radix_apply_noise_squashing(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_noise_squashing_lut<InputTorus> *lut,
void *const *bsks, KSTorus *const *ksks) {
__host__ void integer_radix_apply_noise_squashing(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
int_noise_squashing_lut<InputTorus, KSTorus> *lut, void *const *bsks,
KSTorus *const *ksks) {
PUSH_RANGE("apply noise squashing")
auto params = lut->params;
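The change repeated throughout these hunks is mechanical: every buffer struct and its scratch_* allocator gains a second template parameter, KSTorus, next to the ciphertext torus Torus, so keyswitch material can live in a different word size. A minimal self-contained sketch of the convention, with hypothetical names rather than the real backend types:

#include <cstdint>

// Stand-in for a buffer templated on both torus types (hypothetical).
template <typename Torus, typename KSTorus> struct demo_buffer {
  Torus *lut_storage = nullptr;   // accumulators stay in the ciphertext torus
  KSTorus *ks_storage = nullptr;  // keyswitch temporaries may use a narrower word
  demo_buffer(bool allocate, uint64_t &size_tracker) {
    size_tracker += 1024 * sizeof(Torus) + 1024 * sizeof(KSTorus);
    if (allocate) {
      lut_storage = new Torus[1024];
      ks_storage = new KSTorus[1024];
    }
  }
  void release() {
    delete[] lut_storage;
    delete[] ks_storage;
  }
};

// scratch_* convention mirrored from the diff: allocate through an
// out-parameter and return the number of bytes the buffer accounts for.
template <typename Torus, typename KSTorus>
uint64_t scratch_demo(demo_buffer<Torus, KSTorus> **mem_ptr, bool allocate) {
  uint64_t size_tracker = 0;
  *mem_ptr = new demo_buffer<Torus, KSTorus>(allocate, size_tracker);
  return size_tracker;
}

Note that size_tracker is accumulated whether or not allocate is true, which is presumably what lets callers run scratch as a pure sizing pass before committing device memory.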

View File

@@ -88,7 +88,7 @@ uint64_t scratch_cuda_integer_mult_radix_ciphertext_64(
case 8192:
case 16384:
return scratch_cuda_integer_mult_radix_ciphertext<uint64_t>(
CudaStreams(streams), (int_mul_memory<uint64_t> **)mem_ptr,
CudaStreams(streams), (int_mul_memory<uint64_t, uint64_t> **)mem_ptr,
is_boolean_left, is_boolean_right, num_radix_blocks, params,
allocate_gpu_memory);
default:
@@ -133,46 +133,46 @@ void cuda_integer_mult_radix_ciphertext_64(
PUSH_RANGE("mul")
switch (polynomial_size) {
case 256:
host_integer_mult_radix<uint64_t, AmortizedDegree<256>>(
host_integer_mult_radix<uint64_t, uint64_t, AmortizedDegree<256>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
(int_mul_memory<uint64_t, uint64_t> *)mem_ptr, num_blocks);
break;
case 512:
host_integer_mult_radix<uint64_t, AmortizedDegree<512>>(
host_integer_mult_radix<uint64_t, uint64_t, AmortizedDegree<512>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
(int_mul_memory<uint64_t, uint64_t> *)mem_ptr, num_blocks);
break;
case 1024:
host_integer_mult_radix<uint64_t, AmortizedDegree<1024>>(
host_integer_mult_radix<uint64_t, uint64_t, AmortizedDegree<1024>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
(int_mul_memory<uint64_t, uint64_t> *)mem_ptr, num_blocks);
break;
case 2048:
host_integer_mult_radix<uint64_t, AmortizedDegree<2048>>(
host_integer_mult_radix<uint64_t, uint64_t, AmortizedDegree<2048>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
(int_mul_memory<uint64_t, uint64_t> *)mem_ptr, num_blocks);
break;
case 4096:
host_integer_mult_radix<uint64_t, AmortizedDegree<4096>>(
host_integer_mult_radix<uint64_t, uint64_t, AmortizedDegree<4096>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
(int_mul_memory<uint64_t, uint64_t> *)mem_ptr, num_blocks);
break;
case 8192:
host_integer_mult_radix<uint64_t, AmortizedDegree<8192>>(
host_integer_mult_radix<uint64_t, uint64_t, AmortizedDegree<8192>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
(int_mul_memory<uint64_t, uint64_t> *)mem_ptr, num_blocks);
break;
case 16384:
host_integer_mult_radix<uint64_t, AmortizedDegree<16384>>(
host_integer_mult_radix<uint64_t, uint64_t, AmortizedDegree<16384>>(
CudaStreams(streams), radix_lwe_out, radix_lwe_left, is_bool_left,
radix_lwe_right, is_bool_right, bsks, (uint64_t **)(ksks),
(int_mul_memory<uint64_t> *)mem_ptr, num_blocks);
(int_mul_memory<uint64_t, uint64_t> *)mem_ptr, num_blocks);
break;
default:
PANIC("Cuda error (integer multiplication): unsupported polynomial size. "
@@ -183,8 +183,8 @@ void cuda_integer_mult_radix_ciphertext_64(
void cleanup_cuda_integer_mult(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup mul")
int_mul_memory<uint64_t> *mem_ptr =
(int_mul_memory<uint64_t> *)(*mem_ptr_void);
int_mul_memory<uint64_t, uint64_t> *mem_ptr =
(int_mul_memory<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
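cuda_integer_mult_radix_ciphertext_64 above converts the runtime polynomial_size into a compile-time template argument with a switch, one instantiation per supported degree. Only the AmortizedDegree name is taken from the diff; the rest of this model is hypothetical:

#include <cstdint>
#include <cstdio>

template <int N> struct AmortizedDegree {
  static constexpr int degree = N;
};

template <typename Torus, typename KSTorus, class params>
void host_demo_mult(uint32_t num_blocks) {
  // params::degree is a compile-time constant here, so a real kernel body
  // can be specialized per polynomial size.
  std::printf("degree=%d blocks=%u\n", params::degree, num_blocks);
}

void demo_mult_64(uint32_t polynomial_size, uint32_t num_blocks) {
  switch (polynomial_size) {
  case 256:
    host_demo_mult<uint64_t, uint64_t, AmortizedDegree<256>>(num_blocks);
    break;
  case 512:
    host_demo_mult<uint64_t, uint64_t, AmortizedDegree<512>>(num_blocks);
    break;
  default:
    std::printf("unsupported polynomial size\n");
  }
}

The cost of the pattern is visible in the hunk above: every case is its own instantiation, so the KSTorus addition had to be spelled out once per supported degree.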
@@ -209,9 +209,9 @@ uint64_t scratch_cuda_partial_sum_ciphertexts_vec_64(
noise_reduction_type);
return scratch_cuda_integer_partial_sum_ciphertexts_vec<uint64_t>(
CudaStreams(streams),
(int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
max_num_radix_in_vec, reduce_degrees_for_single_carry_propagation, params,
allocate_gpu_memory);
(int_sum_ciphertexts_vec_memory<uint64_t, uint64_t> **)mem_ptr,
num_blocks_in_radix, max_num_radix_in_vec,
reduce_degrees_for_single_carry_propagation, params, allocate_gpu_memory);
}
void cuda_partial_sum_ciphertexts_vec_64(CudaStreamsFFI streams,
@@ -220,7 +220,7 @@ void cuda_partial_sum_ciphertexts_vec_64(CudaStreamsFFI streams,
int8_t *mem_ptr, void *const *bsks,
void *const *ksks) {
auto mem = (int_sum_ciphertexts_vec_memory<uint64_t> *)mem_ptr;
auto mem = (int_sum_ciphertexts_vec_memory<uint64_t, uint64_t> *)mem_ptr;
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
PANIC("Cuda error: input vector length should be a multiple of the "
"output's number of radix blocks")
@@ -232,8 +232,8 @@ void cuda_partial_sum_ciphertexts_vec_64(CudaStreamsFFI streams,
void cleanup_cuda_partial_sum_ciphertexts_vec(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
(int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);
int_sum_ciphertexts_vec_memory<uint64_t, uint64_t> *mem_ptr =
(int_sum_ciphertexts_vec_memory<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
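At the C FFI boundary the buffer travels as an opaque int8_t*, as in cleanup_cuda_partial_sum_ciphertexts_vec just above: the _64 entry points are the only place that knows the concrete instantiation, so every cast has to name both template arguments in lockstep with the scratch call. A reduced sketch of that erase/recover round trip (hypothetical names):

#include <cstdint>

template <typename Torus, typename KSTorus> struct demo_memory {
  void release() { /* free device allocations here */ }
};

// Scratch: allocate the typed buffer, hand it out type-erased.
void scratch_demo_64(int8_t **mem_ptr_void) {
  auto *mem = new demo_memory<uint64_t, uint64_t>();
  *mem_ptr_void = reinterpret_cast<int8_t *>(mem);
}

// Cleanup: recover the exact instantiation, release, delete.
void cleanup_demo_64(int8_t **mem_ptr_void) {
  auto *mem =
      reinterpret_cast<demo_memory<uint64_t, uint64_t> *>(*mem_ptr_void);
  mem->release();
  delete mem;
  *mem_ptr_void = nullptr;
}

Casting back to any instantiation other than the one scratch created would be undefined behavior, which is why these hunks update scratch, compute and cleanup together for each operation.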

View File

@@ -267,26 +267,27 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
}
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec(
CudaStreams streams, int_sum_ciphertexts_vec_memory<Torus> **mem_ptr,
CudaStreams streams,
int_sum_ciphertexts_vec_memory<Torus, KSTorus> **mem_ptr,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
bool reduce_degrees_for_single_carry_propagation, int_radix_params params,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
*mem_ptr = new int_sum_ciphertexts_vec_memory<Torus, KSTorus>(
streams, params, num_blocks_in_radix, max_num_radix_in_vec,
reduce_degrees_for_single_carry_propagation, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_partial_sum_ciphertexts_vec(
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI *terms, void *const *bsks, uint64_t *const *ksks,
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr,
int_sum_ciphertexts_vec_memory<Torus, KSTorus> *mem_ptr,
uint32_t num_radix_blocks, uint32_t num_radix_in_vec) {
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
@@ -486,13 +487,13 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
}
}
template <typename Torus, class params>
template <typename Torus, typename KSTorus, class params>
__host__ void host_integer_mult_radix(
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
CudaRadixCiphertextFFI const *radix_lwe_left, bool const is_bool_left,
CudaRadixCiphertextFFI const *radix_lwe_right, bool const is_bool_right,
void *const *bsks, uint64_t *const *ksks, int_mul_memory<Torus> *mem_ptr,
uint32_t num_blocks) {
void *const *bsks, uint64_t *const *ksks,
int_mul_memory<Torus, KSTorus> *mem_ptr, uint32_t num_blocks) {
if (radix_lwe_out->lwe_dimension != radix_lwe_left->lwe_dimension ||
radix_lwe_right->lwe_dimension != radix_lwe_left->lwe_dimension)
@@ -624,17 +625,17 @@ __host__ void host_integer_mult_radix(
uses_carry);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_integer_mult_radix_ciphertext(
CudaStreams streams, int_mul_memory<Torus> **mem_ptr,
CudaStreams streams, int_mul_memory<Torus, KSTorus> **mem_ptr,
bool const is_boolean_left, bool const is_boolean_right,
uint32_t num_radix_blocks, int_radix_params params,
bool allocate_gpu_memory) {
PUSH_RANGE("scratch mul")
uint64_t size_tracker = 0;
*mem_ptr = new int_mul_memory<Torus>(streams, params, is_boolean_left,
is_boolean_right, num_radix_blocks,
allocate_gpu_memory, size_tracker);
*mem_ptr = new int_mul_memory<Torus, KSTorus>(
streams, params, is_boolean_left, is_boolean_right, num_radix_blocks,
allocate_gpu_memory, size_tracker);
POP_RANGE()
return size_tracker;
}
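host_integer_partial_sum_ciphertexts_vec above adds many radix ciphertexts at once: addends are summed blockwise while the carry space absorbs the overflow, and only afterwards does carry propagation normalize the result (that is what reduce_degrees_for_single_carry_propagation prepares for). A plaintext model of blockwise accumulate-then-normalize, as a sketch only:

#include <cstdint>
#include <vector>

// Sum several base-B digit vectors with deferred carry propagation
// (plaintext model; B plays the role of the message modulus).
std::vector<uint32_t>
partial_sum(const std::vector<std::vector<uint32_t>> &terms, uint32_t B) {
  size_t n = terms.at(0).size();
  std::vector<uint32_t> acc(n, 0);
  for (const auto &t : terms)
    for (size_t i = 0; i < n; ++i)
      acc[i] += t[i]; // blockwise add; carries pile up inside each block
  uint32_t carry = 0; // one propagation pass at the end
  for (size_t i = 0; i < n; ++i) {
    uint32_t v = acc[i] + carry;
    acc[i] = v % B;
    carry = v / B;
  }
  return acc; // equals the sum of all terms mod B^n
}

In the ciphertext world the per-block accumulation is bounded by the carry modulus, so the real code has to interleave reduction steps (hence the degree bookkeeping) before blocks saturate; the sketch only shows the overall shape.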

View File

@@ -16,8 +16,9 @@ uint64_t scratch_cuda_integer_grouped_oprf_64(
noise_reduction_type);
return scratch_cuda_integer_grouped_oprf<uint64_t>(
CudaStreams(streams), (int_grouped_oprf_memory<uint64_t> **)mem_ptr,
params, num_blocks_to_process, message_bits_per_block, total_random_bits,
CudaStreams(streams),
(int_grouped_oprf_memory<uint64_t, uint64_t> **)mem_ptr, params,
num_blocks_to_process, message_bits_per_block, total_random_bits,
allocate_gpu_memory);
}
@@ -29,14 +30,15 @@ void cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
host_integer_grouped_oprf<uint64_t>(
CudaStreams(streams), radix_lwe_out, (const uint64_t *)seeded_lwe_input,
num_blocks_to_process, (int_grouped_oprf_memory<uint64_t> *)mem, bsks);
num_blocks_to_process, (int_grouped_oprf_memory<uint64_t, uint64_t> *)mem,
bsks);
}
void cleanup_cuda_integer_grouped_oprf_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_grouped_oprf_memory<uint64_t> *mem_ptr =
(int_grouped_oprf_memory<uint64_t> *)(*mem_ptr_void);
int_grouped_oprf_memory<uint64_t, uint64_t> *mem_ptr =
(int_grouped_oprf_memory<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -62,9 +64,9 @@ uint64_t scratch_cuda_integer_grouped_oprf_custom_range_64(
return scratch_cuda_integer_grouped_oprf_custom_range<uint64_t>(
CudaStreams(streams),
(int_grouped_oprf_custom_range_memory<uint64_t> **)mem_ptr, params,
num_blocks_intermediate, message_bits_per_block, num_input_random_bits,
num_scalar_bits, allocate_gpu_memory);
(int_grouped_oprf_custom_range_memory<uint64_t, uint64_t> **)mem_ptr,
params, num_blocks_intermediate, message_bits_per_block,
num_input_random_bits, num_scalar_bits, allocate_gpu_memory);
}
void cuda_integer_grouped_oprf_custom_range_64(
@@ -78,14 +80,15 @@ void cuda_integer_grouped_oprf_custom_range_64(
CudaStreams(streams), radix_lwe_out, num_blocks_intermediate,
(const uint64_t *)seeded_lwe_input, decomposed_scalar,
has_at_least_one_set, num_scalars, shift,
(int_grouped_oprf_custom_range_memory<uint64_t> *)mem, bsks,
(int_grouped_oprf_custom_range_memory<uint64_t, uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
}
void cleanup_cuda_integer_grouped_oprf_custom_range_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_grouped_oprf_custom_range_memory<uint64_t> *mem_ptr =
(int_grouped_oprf_custom_range_memory<uint64_t> *)(*mem_ptr_void);
int_grouped_oprf_custom_range_memory<uint64_t, uint64_t> *mem_ptr =
(int_grouped_oprf_custom_range_memory<uint64_t, uint64_t>
*)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
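Every _64 entry point in this commit instantiates both parameters as uint64_t, so behavior is unchanged for now; the second parameter presumably exists to admit keyswitch keys in a narrower torus, e.g. 32-bit keyswitch words next to 64-bit ciphertexts. A hedged sketch of the two instantiations the templates now allow (the function is illustrative, not a backend API):

#include <cstdint>

template <typename Torus, typename KSTorus>
void demo_keyswitch(const Torus *in, KSTorus *out, uint32_t n) {
  // Stand-in body: a real keyswitch decomposes `in` against key material in
  // the KSTorus domain; this only models the word-width change.
  for (uint32_t i = 0; i < n; ++i)
    out[i] = static_cast<KSTorus>(
        in[i] >> (8 * (sizeof(Torus) - sizeof(KSTorus))));
}

// Existing path: 64-bit ciphertext torus, 64-bit keyswitch torus.
template void demo_keyswitch<uint64_t, uint64_t>(const uint64_t *, uint64_t *,
                                                 uint32_t);
// Path the refactor opens up: 64-bit ciphertexts, 32-bit keyswitch words.
template void demo_keyswitch<uint64_t, uint32_t>(const uint64_t *, uint32_t *,
                                                 uint32_t);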

View File

@@ -6,27 +6,27 @@
#include "integer/scalar_mul.cuh"
#include "integer/scalar_shifts.cuh"
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_integer_grouped_oprf(
CudaStreams streams, int_grouped_oprf_memory<Torus> **mem_ptr,
CudaStreams streams, int_grouped_oprf_memory<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_blocks_to_process,
uint32_t message_bits_per_block, uint64_t total_random_bits,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_grouped_oprf_memory<Torus>(
*mem_ptr = new int_grouped_oprf_memory<Torus, KSTorus>(
streams, params, num_blocks_to_process, message_bits_per_block,
total_random_bits, allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
void host_integer_grouped_oprf(CudaStreams streams,
CudaRadixCiphertextFFI *radix_lwe_out,
const Torus *seeded_lwe_input,
uint32_t num_blocks_to_process,
int_grouped_oprf_memory<Torus> *mem_ptr,
int_grouped_oprf_memory<Torus, KSTorus> *mem_ptr,
void *const *bsks) {
auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
@@ -90,15 +90,16 @@ void host_integer_grouped_oprf(CudaStreams streams,
mem_ptr->params.carry_modulus);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_integer_grouped_oprf_custom_range(
CudaStreams streams, int_grouped_oprf_custom_range_memory<Torus> **mem_ptr,
CudaStreams streams,
int_grouped_oprf_custom_range_memory<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_blocks_intermediate,
uint32_t message_bits_per_block, uint64_t num_input_random_bits,
uint32_t num_scalar_bits, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_grouped_oprf_custom_range_memory<Torus>(
*mem_ptr = new int_grouped_oprf_custom_range_memory<Torus, KSTorus>(
streams, params, num_blocks_intermediate, message_bits_per_block,
num_input_random_bits, num_scalar_bits, allocate_gpu_memory,
size_tracker);
@@ -106,14 +107,14 @@ uint64_t scratch_cuda_integer_grouped_oprf_custom_range(
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
void host_integer_grouped_oprf_custom_range(
CudaStreams streams, CudaRadixCiphertextFFI *radix_lwe_out,
uint32_t num_blocks_intermediate, const Torus *seeded_lwe_input,
const Torus *decomposed_scalar, const Torus *has_at_least_one_set,
uint32_t num_scalars, uint32_t shift,
int_grouped_oprf_custom_range_memory<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
int_grouped_oprf_custom_range_memory<Torus, KSTorus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
CudaRadixCiphertextFFI *computation_buffer = mem_ptr->tmp_oprf_output;
set_zero_radix_ciphertext_slice_async<Torus>(

View File

@@ -10,7 +10,8 @@ void cuda_scalar_bitop_ciphertext_64(
CudaStreams(streams), lwe_array_out, lwe_array_input,
static_cast<const uint64_t *>(clear_blocks),
static_cast<const uint64_t *>(h_clear_blocks), num_clear_blocks,
(int_bitop_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
(int_bitop_buffer<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
}
void update_degrees_after_scalar_bitand(uint64_t *output_degrees,

View File

@@ -8,8 +8,9 @@ __host__ void
host_scalar_bitop(CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
Torus const *clear_blocks, Torus const *h_clear_blocks,
uint32_t num_clear_blocks, int_bitop_buffer<Torus> *mem_ptr,
void *const *bsks, KSTorus *const *ksks) {
uint32_t num_clear_blocks,
int_bitop_buffer<Torus, KSTorus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks) {
if (output->num_radix_blocks != input->num_radix_blocks)
PANIC("Cuda error: input and output num radix blocks must be equal")

View File

@@ -41,8 +41,8 @@ void cuda_scalar_comparison_ciphertext_64(
// depending on the case (eq/gt vs max/min) so the amount of blocks to
// consider for calculation is the one of the input
auto num_radix_blocks = lwe_array_in->num_radix_blocks;
int_comparison_buffer<uint64_t> *buffer =
(int_comparison_buffer<uint64_t> *)mem_ptr;
int_comparison_buffer<uint64_t, uint64_t> *buffer =
(int_comparison_buffer<uint64_t, uint64_t> *)mem_ptr;
switch (buffer->op) {
case EQ:
case NE:

View File

@@ -28,7 +28,7 @@ template <typename Torus, typename KSTorus>
__host__ void scalar_compare_radix_blocks(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI *lwe_array_in, Torus *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
int_comparison_buffer<Torus, KSTorus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks) {
if (num_radix_blocks == 0)
@@ -86,7 +86,8 @@ template <typename Torus, typename KSTorus>
__host__ void integer_radix_unsigned_scalar_difference_check(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
Torus const *h_scalar_blocks,
int_comparison_buffer<Torus, KSTorus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks) {
@@ -265,8 +266,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
return (Torus)(invert_flags.second ^ overflowed);
};
uint64_t size = 0;
int_radix_lut<Torus> *one_block_lut =
new int_radix_lut<Torus>(streams, params, 1, 1, true, size);
int_radix_lut<Torus, KSTorus> *one_block_lut =
new int_radix_lut<Torus, KSTorus>(streams, params, 1, 1, true, size);
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams.stream(0), streams.gpu_index(0), one_block_lut->get_lut(0, 0),
@@ -325,7 +326,8 @@ template <typename Torus, typename KSTorus>
__host__ void integer_radix_signed_scalar_difference_check(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
Torus const *h_scalar_blocks,
int_comparison_buffer<Torus, KSTorus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks) {
@@ -558,8 +560,8 @@ __host__ void integer_radix_signed_scalar_difference_check(
message_modulus);
};
uint64_t size = 0;
int_radix_lut<Torus> *one_block_lut =
new int_radix_lut<Torus>(streams, params, 1, 1, true, size);
int_radix_lut<Torus, KSTorus> *one_block_lut =
new int_radix_lut<Torus, KSTorus>(streams, params, 1, 1, true, size);
generate_device_accumulator_with_cpu_prealloc<Torus>(
streams.stream(0), streams.gpu_index(0), one_block_lut->get_lut(0, 0),
@@ -644,7 +646,8 @@ template <typename Torus, typename KSTorus>
__host__ void host_scalar_difference_check(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
Torus const *h_scalar_blocks, int_comparison_buffer<Torus> *mem_ptr,
Torus const *h_scalar_blocks,
int_comparison_buffer<Torus, KSTorus> *mem_ptr,
std::function<Torus(Torus)> sign_handler_f, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks) {
@@ -674,9 +677,9 @@ __host__ void
host_scalar_maxmin(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in,
Torus const *scalar_blocks, Torus const *h_scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks) {
int_comparison_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, KSTorus *const *ksks,
uint32_t num_radix_blocks, uint32_t num_scalar_blocks) {
if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
PANIC("Cuda error: input and output lwe dimensions must be the same")
@@ -717,7 +720,7 @@ template <typename Torus, typename KSTorus>
__host__ void host_scalar_equality_check(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in, Torus const *scalar_blocks,
int_comparison_buffer<Torus> *mem_ptr, void *const *bsks,
int_comparison_buffer<Torus, KSTorus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks, uint32_t num_radix_blocks,
uint32_t num_scalar_blocks) {
@@ -797,7 +800,7 @@ __host__ void host_scalar_equality_check(
//////////////
// msb_in
if (num_msb_radix_blocks > 0) {
int_radix_lut<Torus> *msb_lut;
int_radix_lut<Torus, KSTorus> *msb_lut;
switch (mem_ptr->op) {
case COMPARISON_TYPE::EQ:
msb_lut = mem_ptr->is_zero_lut;

View File

@@ -17,7 +17,7 @@ uint64_t scratch_cuda_integer_unsigned_scalar_div_radix_64(
return scratch_integer_unsigned_scalar_div_radix<uint64_t>(
CudaStreams(streams), params,
(int_unsigned_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
(int_unsigned_scalar_div_mem<uint64_t, uint64_t> **)mem_ptr, num_blocks,
scalar_divisor_ffi, allocate_gpu_memory);
}
@@ -28,15 +28,15 @@ void cuda_integer_unsigned_scalar_div_radix_64(
host_integer_unsigned_scalar_div_radix<uint64_t>(
CudaStreams(streams), numerator_ct,
(int_unsigned_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
scalar_divisor_ffi);
(int_unsigned_scalar_div_mem<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, scalar_divisor_ffi);
}
void cleanup_cuda_integer_unsigned_scalar_div_radix_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unsigned_scalar_div_mem<uint64_t> *mem_ptr =
(int_unsigned_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
int_unsigned_scalar_div_mem<uint64_t, uint64_t> *mem_ptr =
(int_unsigned_scalar_div_mem<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -59,9 +59,9 @@ uint64_t scratch_cuda_integer_signed_scalar_div_radix_64(
grouping_factor, message_modulus, carry_modulus,
noise_reduction_type);
return scratch_integer_signed_scalar_div_radix<uint64_t>(
return scratch_integer_signed_scalar_div_radix<uint64_t, uint64_t>(
CudaStreams(streams), params,
(int_signed_scalar_div_mem<uint64_t> **)mem_ptr, num_blocks,
(int_signed_scalar_div_mem<uint64_t, uint64_t> **)mem_ptr, num_blocks,
scalar_divisor_ffi, allocate_gpu_memory);
}
@@ -70,17 +70,17 @@ void cuda_integer_signed_scalar_div_radix_64(
int8_t *mem_ptr, void *const *bsks, void *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi, uint32_t numerator_bits) {
host_integer_signed_scalar_div_radix<uint64_t>(
host_integer_signed_scalar_div_radix<uint64_t, uint64_t>(
CudaStreams(streams), numerator_ct,
(int_signed_scalar_div_mem<uint64_t> *)mem_ptr, bsks, (uint64_t **)ksks,
scalar_divisor_ffi, numerator_bits);
(int_signed_scalar_div_mem<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, scalar_divisor_ffi, numerator_bits);
}
void cleanup_cuda_integer_signed_scalar_div_radix_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_signed_scalar_div_mem<uint64_t> *mem_ptr =
(int_signed_scalar_div_mem<uint64_t> *)(*mem_ptr_void);
int_signed_scalar_div_mem<uint64_t, uint64_t> *mem_ptr =
(int_signed_scalar_div_mem<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -106,8 +106,8 @@ uint64_t scratch_integer_unsigned_scalar_div_rem_radix_64(
return scratch_integer_unsigned_scalar_div_rem_radix<uint64_t>(
CudaStreams(streams), params,
(int_unsigned_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
(int_unsigned_scalar_div_rem_buffer<uint64_t, uint64_t> **)mem_ptr,
num_blocks, scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
}
void cuda_integer_unsigned_scalar_div_rem_radix_64(
@@ -121,7 +121,7 @@ void cuda_integer_unsigned_scalar_div_rem_radix_64(
host_integer_unsigned_scalar_div_rem_radix<uint64_t>(
CudaStreams(streams), quotient_ct, remainder_ct,
(int_unsigned_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
(int_unsigned_scalar_div_rem_buffer<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, scalar_divisor_ffi, divisor_has_at_least_one_set,
decomposed_divisor, num_scalars_divisor, (uint64_t *)clear_blocks,
(uint64_t *)h_clear_blocks, num_clear_blocks);
@@ -130,8 +130,8 @@ void cuda_integer_unsigned_scalar_div_rem_radix_64(
void cleanup_cuda_integer_unsigned_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_unsigned_scalar_div_rem_buffer<uint64_t> *mem_ptr =
(int_unsigned_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);
int_unsigned_scalar_div_rem_buffer<uint64_t, uint64_t> *mem_ptr =
(int_unsigned_scalar_div_rem_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -157,8 +157,8 @@ uint64_t scratch_integer_signed_scalar_div_rem_radix_64(
return scratch_integer_signed_scalar_div_rem_radix<uint64_t>(
CudaStreams(streams), params,
(int_signed_scalar_div_rem_buffer<uint64_t> **)mem_ptr, num_blocks,
scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
(int_signed_scalar_div_rem_buffer<uint64_t, uint64_t> **)mem_ptr,
num_blocks, scalar_divisor_ffi, active_bits_divisor, allocate_gpu_memory);
}
void cuda_integer_signed_scalar_div_rem_radix_64(
@@ -171,7 +171,7 @@ void cuda_integer_signed_scalar_div_rem_radix_64(
host_integer_signed_scalar_div_rem_radix<uint64_t>(
CudaStreams(streams), quotient_ct, remainder_ct,
(int_signed_scalar_div_rem_buffer<uint64_t> *)mem_ptr, bsks,
(int_signed_scalar_div_rem_buffer<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)ksks, scalar_divisor_ffi, divisor_has_at_least_one_set,
decomposed_divisor, num_scalars_divisor, numerator_bits);
}
@@ -179,8 +179,8 @@ void cuda_integer_signed_scalar_div_rem_radix_64(
void cleanup_cuda_integer_signed_scalar_div_rem_radix_64(
CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_signed_scalar_div_rem_buffer<uint64_t> *mem_ptr =
(int_signed_scalar_div_rem_buffer<uint64_t> *)(*mem_ptr_void);
int_signed_scalar_div_rem_buffer<uint64_t, uint64_t> *mem_ptr =
(int_signed_scalar_div_rem_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
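The scalar division paths implement division by an invariant integer with the classic multiply-high-and-shift scheme; the chosen_multiplier fields carried in CudaScalarDivisorFFI and the scalar_mul_high buffers appear to be the homomorphic pieces of it. For plain integers the identity looks as follows, as a sketch only (the encrypted version performs the analogous multiply-high, add and shift steps blockwise on ciphertexts):

#include <cassert>
#include <cstdint>

// Divide a 32-bit value by a fixed divisor d via multiply-high + shift:
// with s = ceil(log2 d) and m = ceil(2^(32+s) / d), one has
// x / d == (x * m) >> (32 + s) for every 32-bit x.
// __uint128_t is a GCC/Clang extension, available in host code here.
struct invariant_divider {
  uint64_t m;
  uint32_t s;
  explicit invariant_divider(uint32_t d) {
    assert(d > 1 && d <= (1u << 31));
    s = 0;
    while ((1u << s) < d)
      ++s; // s = ceil(log2 d), at most 31 here
    m = uint64_t(((__uint128_t(1) << (32 + s)) + d - 1) / d);
  }
  uint32_t divide(uint32_t x) const {
    return uint32_t((__uint128_t(x) * m) >> (32 + s));
  }
};
// Example: invariant_divider div7(7); div7.divide(100) == 14

The signed variant in the next file adds the corrections visible in host_integer_signed_scalar_div_radix: an extra shift of the numerator and a further add-and-propagate after the multiply-high.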

View File

@@ -8,16 +8,16 @@
#include "integer/scalar_shifts.cuh"
#include "integer/subtraction.cuh"
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_integer_unsigned_scalar_div_radix(
CudaStreams streams, const int_radix_params params,
int_unsigned_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
int_unsigned_scalar_div_mem<Torus, KSTorus> **mem_ptr,
uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
const bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unsigned_scalar_div_mem<Torus>(
*mem_ptr = new int_unsigned_scalar_div_mem<Torus, KSTorus>(
streams, params, num_radix_blocks, scalar_divisor_ffi,
allocate_gpu_memory, size_tracker);
@@ -27,7 +27,7 @@ __host__ uint64_t scratch_integer_unsigned_scalar_div_radix(
template <typename Torus, typename KSTorus>
__host__ void host_integer_unsigned_scalar_div_radix(
CudaStreams streams, CudaRadixCiphertextFFI *numerator_ct,
int_unsigned_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
int_unsigned_scalar_div_mem<Torus, KSTorus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi) {
if (scalar_divisor_ffi->is_abs_divisor_one) {
@@ -75,7 +75,7 @@ __host__ void host_integer_unsigned_scalar_div_radix(
streams, numerator_ct, (uint32_t)1, mem_ptr->logical_scalar_shift_mem,
bsks, ksks, numerator_ct->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
host_add_and_propagate_single_carry<Torus, KSTorus>(
streams, numerator_ct, numerator_cpy, nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, FLAG_NONE, (uint32_t)0);
@@ -102,16 +102,16 @@ __host__ void host_integer_unsigned_scalar_div_radix(
numerator_ct->num_radix_blocks);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_integer_signed_scalar_div_radix(
CudaStreams streams, int_radix_params params,
int_signed_scalar_div_mem<Torus> **mem_ptr, uint32_t num_radix_blocks,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
int_signed_scalar_div_mem<Torus, KSTorus> **mem_ptr,
uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
const bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_signed_scalar_div_mem<Torus>(
*mem_ptr = new int_signed_scalar_div_mem<Torus, KSTorus>(
streams, params, num_radix_blocks, scalar_divisor_ffi,
allocate_gpu_memory, size_tracker);
@@ -121,7 +121,7 @@ __host__ uint64_t scratch_integer_signed_scalar_div_radix(
template <typename Torus, typename KSTorus>
__host__ void host_integer_signed_scalar_div_radix(
CudaStreams streams, CudaRadixCiphertextFFI *numerator_ct,
int_signed_scalar_div_mem<Torus> *mem_ptr, void *const *bsks,
int_signed_scalar_div_mem<Torus, KSTorus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t numerator_bits) {
@@ -161,7 +161,7 @@ __host__ void host_integer_signed_scalar_div_radix(
numerator_bits - scalar_divisor_ffi->chosen_multiplier_num_bits,
mem_ptr->logical_scalar_shift_mem, bsks, ksks, tmp->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
host_add_and_propagate_single_carry<Torus, KSTorus>(
streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
ksks, FLAG_NONE, (uint32_t)0);
@@ -202,7 +202,7 @@ __host__ void host_integer_signed_scalar_div_radix(
mem_ptr->scalar_mul_high_mem, ksks,
scalar_divisor_ffi, bsks);
host_add_and_propagate_single_carry<Torus>(
host_add_and_propagate_single_carry<Torus, KSTorus>(
streams, tmp, numerator_ct, nullptr, nullptr, mem_ptr->scp_mem, bsks,
ksks, FLAG_NONE, (uint32_t)0);
@@ -233,15 +233,15 @@ __host__ void host_integer_signed_scalar_div_radix(
}
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_integer_unsigned_scalar_div_rem_radix(
CudaStreams streams, const int_radix_params params,
int_unsigned_scalar_div_rem_buffer<Torus> **mem_ptr,
int_unsigned_scalar_div_rem_buffer<Torus, KSTorus> **mem_ptr,
uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t const active_bits_divisor, const bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unsigned_scalar_div_rem_buffer<Torus>(
*mem_ptr = new int_unsigned_scalar_div_rem_buffer<Torus, KSTorus>(
streams, params, num_radix_blocks, scalar_divisor_ffi,
active_bits_divisor, allocate_gpu_memory, size_tracker);
return size_tracker;
@@ -251,8 +251,9 @@ template <typename Torus, typename KSTorus>
__host__ void host_integer_unsigned_scalar_div_rem_radix(
CudaStreams streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct,
int_unsigned_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
int_unsigned_scalar_div_rem_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, KSTorus *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
Torus const *clear_blocks, Torus const *h_clear_blocks,
@@ -298,16 +299,16 @@ __host__ void host_integer_unsigned_scalar_div_rem_radix(
}
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_integer_signed_scalar_div_rem_radix(
CudaStreams streams, const int_radix_params params,
int_signed_scalar_div_rem_buffer<Torus> **mem_ptr,
int_signed_scalar_div_rem_buffer<Torus, KSTorus> **mem_ptr,
uint32_t num_radix_blocks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint32_t const active_bits_divisor, const bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_signed_scalar_div_rem_buffer<Torus>(
*mem_ptr = new int_signed_scalar_div_rem_buffer<Torus, KSTorus>(
streams, params, num_radix_blocks, scalar_divisor_ffi,
active_bits_divisor, allocate_gpu_memory, size_tracker);
@@ -318,8 +319,9 @@ template <typename Torus, typename KSTorus>
__host__ void host_integer_signed_scalar_div_rem_radix(
CudaStreams streams, CudaRadixCiphertextFFI *quotient_ct,
CudaRadixCiphertextFFI *remainder_ct,
int_signed_scalar_div_rem_buffer<Torus> *mem_ptr, void *const *bsks,
KSTorus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
int_signed_scalar_div_rem_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, KSTorus *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi,
uint64_t const *divisor_has_at_least_one_set,
uint64_t const *decomposed_divisor, uint32_t const num_scalars_divisor,
uint32_t numerator_bits) {

View File

@@ -15,8 +15,9 @@ uint64_t scratch_cuda_integer_scalar_mul_64(
noise_reduction_type);
return scratch_cuda_scalar_mul<uint64_t>(
CudaStreams(streams), (int_scalar_mul_buffer<uint64_t> **)mem_ptr,
num_blocks, params, num_scalar_bits, allocate_gpu_memory);
CudaStreams(streams),
(int_scalar_mul_buffer<uint64_t, uint64_t> **)mem_ptr, num_blocks, params,
num_scalar_bits, allocate_gpu_memory);
}
void cuda_scalar_multiplication_ciphertext_64_inplace(
@@ -27,14 +28,14 @@ void cuda_scalar_multiplication_ciphertext_64_inplace(
host_integer_scalar_mul_radix<uint64_t>(
CudaStreams(streams), lwe_array, decomposed_scalar, has_at_least_one_set,
reinterpret_cast<int_scalar_mul_buffer<uint64_t> *>(mem), bsks,
reinterpret_cast<int_scalar_mul_buffer<uint64_t, uint64_t> *>(mem), bsks,
(uint64_t **)(ksks), message_modulus, num_scalars);
}
void cleanup_cuda_scalar_mul(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_scalar_mul_buffer<uint64_t> *mem_ptr =
(int_scalar_mul_buffer<uint64_t> *)(*mem_ptr_void);
int_scalar_mul_buffer<uint64_t, uint64_t> *mem_ptr =
(int_scalar_mul_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
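host_integer_scalar_mul_radix consumes the scalar as an explicit bit decomposition (decomposed_scalar, with has_at_least_one_set apparently flagging whether any bit is set at all) and builds the product out of shifted ciphertext copies. The plaintext analogue is ordinary shift-and-add; in the encrypted version each shift is a logical scalar shift of a ciphertext copy and the accumulation is the partial-sum machinery shown earlier:

#include <cstdint>
#include <vector>

// Shift-and-add multiplication driven by an explicit bit decomposition
// (plaintext model of the decomposed_scalar loop).
uint64_t scalar_mul_from_bits(uint64_t ct, const std::vector<uint8_t> &bits) {
  uint64_t acc = 0;
  for (size_t i = 0; i < bits.size(); ++i)
    if (bits[i])
      acc += ct << i; // homomorphically: shift a copy, then sum the copies
  return acc;
}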

View File

@@ -29,18 +29,16 @@ __global__ void device_small_scalar_radix_multiplication(T *output_lwe_array,
}
}
template <typename T>
__host__ uint64_t scratch_cuda_scalar_mul(CudaStreams streams,
int_scalar_mul_buffer<T> **mem_ptr,
uint32_t num_radix_blocks,
int_radix_params params,
uint32_t num_scalar_bits,
bool allocate_gpu_memory) {
template <typename T, typename KSTorus>
__host__ uint64_t scratch_cuda_scalar_mul(
CudaStreams streams, int_scalar_mul_buffer<T, KSTorus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
uint32_t num_scalar_bits, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_scalar_mul_buffer<T>(streams, params, num_radix_blocks,
num_scalar_bits, allocate_gpu_memory,
true, size_tracker);
*mem_ptr = new int_scalar_mul_buffer<T, KSTorus>(
streams, params, num_radix_blocks, num_scalar_bits, allocate_gpu_memory,
true, size_tracker);
return size_tracker;
}
@@ -48,8 +46,8 @@ template <typename T, typename KSTorus>
__host__ void host_integer_scalar_mul_radix(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array,
T const *decomposed_scalar, T const *has_at_least_one_set,
int_scalar_mul_buffer<T> *mem, void *const *bsks, KSTorus *const *ksks,
uint32_t message_modulus, uint32_t num_scalars) {
int_scalar_mul_buffer<T, KSTorus> *mem, void *const *bsks,
KSTorus *const *ksks, uint32_t message_modulus, uint32_t num_scalars) {
auto num_radix_blocks = lwe_array->num_radix_blocks;
// lwe_size includes the presence of the body
@@ -170,7 +168,7 @@ __host__ void host_integer_small_scalar_mul_radix(
template <typename Torus, typename KSTorus>
__host__ void
host_scalar_mul_high(CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_scalar_mul_high_buffer<Torus> *mem_ptr,
int_scalar_mul_high_buffer<Torus, KSTorus> *mem_ptr,
KSTorus *const *ksks, void *const *bsks,
const CudaScalarDivisorFFI *scalar_divisor_ffi) {
@@ -210,8 +208,9 @@ host_scalar_mul_high(CudaStreams streams, CudaRadixCiphertextFFI *ct,
template <typename Torus, typename KSTorus>
__host__ void host_signed_scalar_mul_high(
CudaStreams streams, CudaRadixCiphertextFFI *ct,
int_signed_scalar_mul_high_buffer<Torus> *mem_ptr, KSTorus *const *ksks,
const CudaScalarDivisorFFI *scalar_divisor_ffi, void *const *bsks) {
int_signed_scalar_mul_high_buffer<Torus, KSTorus> *mem_ptr,
KSTorus *const *ksks, const CudaScalarDivisorFFI *scalar_divisor_ffi,
void *const *bsks) {
if (scalar_divisor_ffi->is_chosen_multiplier_zero) {
set_zero_radix_ciphertext_slice_async<Torus>(

View File

@@ -16,8 +16,8 @@ uint64_t scratch_cuda_scalar_rotate_64(
return scratch_cuda_scalar_rotate<uint64_t>(
CudaStreams(streams),
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, allocate_gpu_memory);
(int_logical_scalar_shift_buffer<uint64_t, uint64_t> **)mem_ptr,
num_blocks, params, shift_type, allocate_gpu_memory);
}
void cuda_scalar_rotate_64_inplace(CudaStreamsFFI streams,
@@ -27,14 +27,14 @@ void cuda_scalar_rotate_64_inplace(CudaStreamsFFI streams,
host_scalar_rotate_inplace<uint64_t>(
CudaStreams(streams), lwe_array, n,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(int_logical_scalar_shift_buffer<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
}
void cleanup_cuda_scalar_rotate(CudaStreamsFFI streams, int8_t **mem_ptr_void) {
int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
int_logical_scalar_shift_buffer<uint64_t, uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;

View File

@@ -8,14 +8,15 @@
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_scalar_rotate(
CudaStreams streams, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
CudaStreams streams,
int_logical_scalar_shift_buffer<Torus, KSTorus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
*mem_ptr = new int_logical_scalar_shift_buffer<Torus, KSTorus>(
streams, shift_type, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
@@ -25,7 +26,7 @@ template <typename Torus, typename KSTorus>
__host__ void
host_scalar_rotate_inplace(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array, uint32_t n,
int_logical_scalar_shift_buffer<Torus> *mem,
int_logical_scalar_shift_buffer<Torus, KSTorus> *mem,
void *const *bsks, KSTorus *const *ksks) {
auto num_blocks = lwe_array->num_radix_blocks;

View File

@@ -16,8 +16,8 @@ uint64_t scratch_cuda_logical_scalar_shift_64(
return scratch_cuda_logical_scalar_shift<uint64_t>(
CudaStreams(streams),
(int_logical_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks, params,
shift_type, allocate_gpu_memory);
(int_logical_scalar_shift_buffer<uint64_t, uint64_t> **)mem_ptr,
num_blocks, params, shift_type, allocate_gpu_memory);
}
/// The logical scalar shift is the one used for unsigned integers, and
@@ -32,7 +32,7 @@ void cuda_logical_scalar_shift_64_inplace(CudaStreamsFFI streams,
host_logical_scalar_shift_inplace<uint64_t>(
CudaStreams(streams), lwe_array, shift,
(int_logical_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(int_logical_scalar_shift_buffer<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), lwe_array->num_radix_blocks);
}
@@ -52,8 +52,8 @@ uint64_t scratch_cuda_arithmetic_scalar_shift_64(
return scratch_cuda_arithmetic_scalar_shift<uint64_t>(
CudaStreams(streams),
(int_arithmetic_scalar_shift_buffer<uint64_t> **)mem_ptr, num_blocks,
params, shift_type, allocate_gpu_memory);
(int_arithmetic_scalar_shift_buffer<uint64_t, uint64_t> **)mem_ptr,
num_blocks, params, shift_type, allocate_gpu_memory);
}
/// The arithmetic scalar shift is the one used for the signed right shift.
@@ -71,15 +71,15 @@ void cuda_arithmetic_scalar_shift_64_inplace(CudaStreamsFFI streams,
host_arithmetic_scalar_shift_inplace<uint64_t>(
CudaStreams(streams), lwe_array, shift,
(int_arithmetic_scalar_shift_buffer<uint64_t> *)mem_ptr, bsks,
(int_arithmetic_scalar_shift_buffer<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
}
void cleanup_cuda_logical_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_logical_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
int_logical_scalar_shift_buffer<uint64_t, uint64_t> *mem_ptr =
(int_logical_scalar_shift_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
@@ -89,8 +89,8 @@ void cleanup_cuda_logical_scalar_shift(CudaStreamsFFI streams,
void cleanup_cuda_arithmetic_scalar_shift(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_arithmetic_scalar_shift_buffer<uint64_t> *mem_ptr =
(int_arithmetic_scalar_shift_buffer<uint64_t> *)(*mem_ptr_void);
int_arithmetic_scalar_shift_buffer<uint64_t, uint64_t> *mem_ptr =
(int_arithmetic_scalar_shift_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
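Several call sites in these hunks keep a single explicit argument, e.g. scratch_cuda_logical_scalar_shift<uint64_t>(...), even though the template now takes two parameters: C++ deduces trailing template arguments from the function arguments, here from the int_logical_scalar_shift_buffer<uint64_t, uint64_t> ** cast. A minimal demonstration of that rule (hypothetical names):

#include <cstdint>

template <typename Torus, typename KSTorus> struct demo_buffer {};

template <typename Torus, typename KSTorus>
uint64_t scratch_demo(demo_buffer<Torus, KSTorus> **mem_ptr) {
  *mem_ptr = new demo_buffer<Torus, KSTorus>();
  return 0;
}

int main() {
  demo_buffer<uint64_t, uint64_t> *mem = nullptr;
  // Only Torus is spelled out; KSTorus = uint64_t is deduced from &mem.
  scratch_demo<uint64_t>(&mem);
  delete mem;
  // Fully explicit spelling, as other call sites in the diff use:
  scratch_demo<uint64_t, uint64_t>(&mem);
  delete mem;
}

Both spellings instantiate the same function; the fully explicit form is the safer habit once an argument (say, a nullptr) stops giving the compiler anything to deduce from.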

View File

@@ -9,14 +9,15 @@
#include "pbs/programmable_bootstrap_classic.cuh"
#include "pbs/programmable_bootstrap_multibit.cuh"
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_logical_scalar_shift(
CudaStreams streams, int_logical_scalar_shift_buffer<Torus> **mem_ptr,
CudaStreams streams,
int_logical_scalar_shift_buffer<Torus, KSTorus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_logical_scalar_shift_buffer<Torus>(
*mem_ptr = new int_logical_scalar_shift_buffer<Torus, KSTorus>(
streams, shift_type, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
@@ -25,7 +26,7 @@ __host__ uint64_t scratch_cuda_logical_scalar_shift(
template <typename Torus, typename KSTorus>
__host__ void host_logical_scalar_shift_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int_logical_scalar_shift_buffer<Torus> *mem, void *const *bsks,
int_logical_scalar_shift_buffer<Torus, KSTorus> *mem, void *const *bsks,
KSTorus *const *ksks, uint32_t num_blocks) {
if (lwe_array->num_radix_blocks < num_blocks)
@@ -113,14 +114,15 @@ __host__ void host_logical_scalar_shift_inplace(
}
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_arithmetic_scalar_shift(
CudaStreams streams, int_arithmetic_scalar_shift_buffer<Torus> **mem_ptr,
CudaStreams streams,
int_arithmetic_scalar_shift_buffer<Torus, KSTorus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_arithmetic_scalar_shift_buffer<Torus>(
*mem_ptr = new int_arithmetic_scalar_shift_buffer<Torus, KSTorus>(
streams, shift_type, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
@@ -129,7 +131,7 @@ __host__ uint64_t scratch_cuda_arithmetic_scalar_shift(
template <typename Torus, typename KSTorus>
__host__ void host_arithmetic_scalar_shift_inplace(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array, uint32_t shift,
int_arithmetic_scalar_shift_buffer<Torus> *mem, void *const *bsks,
int_arithmetic_scalar_shift_buffer<Torus, KSTorus> *mem, void *const *bsks,
KSTorus *const *ksks) {
auto num_blocks = lwe_array->num_radix_blocks;

View File

@@ -15,8 +15,9 @@ uint64_t scratch_cuda_shift_and_rotate_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_shift_and_rotate<uint64_t>(
CudaStreams(streams), (int_shift_and_rotate_buffer<uint64_t> **)mem_ptr,
num_blocks, params, shift_type, is_signed, allocate_gpu_memory);
CudaStreams(streams),
(int_shift_and_rotate_buffer<uint64_t, uint64_t> **)mem_ptr, num_blocks,
params, shift_type, is_signed, allocate_gpu_memory);
}
void cuda_shift_and_rotate_64_inplace(CudaStreamsFFI streams,
@@ -27,14 +28,14 @@ void cuda_shift_and_rotate_64_inplace(CudaStreamsFFI streams,
host_shift_and_rotate_inplace<uint64_t>(
CudaStreams(streams), lwe_array, lwe_shift,
(int_shift_and_rotate_buffer<uint64_t> *)mem_ptr, bsks,
(int_shift_and_rotate_buffer<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks));
}
void cleanup_cuda_shift_and_rotate(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_shift_and_rotate_buffer<uint64_t> *mem_ptr =
(int_shift_and_rotate_buffer<uint64_t> *)(*mem_ptr_void);
int_shift_and_rotate_buffer<uint64_t, uint64_t> *mem_ptr =
(int_shift_and_rotate_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;

View File

@@ -10,13 +10,13 @@
#include "pbs/programmable_bootstrap_multibit.cuh"
#include "scalar_mul.cuh"
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_shift_and_rotate(
CudaStreams streams, int_shift_and_rotate_buffer<Torus> **mem_ptr,
CudaStreams streams, int_shift_and_rotate_buffer<Torus, KSTorus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params,
SHIFT_OR_ROTATE_TYPE shift_type, bool is_signed, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_shift_and_rotate_buffer<Torus>(
*mem_ptr = new int_shift_and_rotate_buffer<Torus, KSTorus>(
streams, shift_type, is_signed, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
return size_tracker;
@@ -27,7 +27,7 @@ __host__ void
host_shift_and_rotate_inplace(CudaStreams streams,
CudaRadixCiphertextFFI *lwe_array,
CudaRadixCiphertextFFI const *lwe_shift,
int_shift_and_rotate_buffer<Torus> *mem,
int_shift_and_rotate_buffer<Torus, KSTorus> *mem,
void *const *bsks, KSTorus *const *ksks) {
cuda_set_device(streams.gpu_index(0));

View File

@@ -15,8 +15,9 @@ uint64_t scratch_cuda_sub_and_propagate_single_carry_64_inplace(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_sub_and_propagate_single_carry<uint64_t>(
CudaStreams(streams), (int_sub_and_propagate<uint64_t> **)mem_ptr,
num_blocks, params, requested_flag, allocate_gpu_memory);
CudaStreams(streams),
(int_sub_and_propagate<uint64_t, uint64_t> **)mem_ptr, num_blocks, params,
requested_flag, allocate_gpu_memory);
}
void cuda_sub_and_propagate_single_carry_64_inplace(
@@ -27,16 +28,16 @@ void cuda_sub_and_propagate_single_carry_64_inplace(
PUSH_RANGE("sub")
host_sub_and_propagate_single_carry<uint64_t>(
CudaStreams(streams), lhs_array, rhs_array, carry_out, carry_in,
(int_sub_and_propagate<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),
requested_flag, uses_carry);
(int_sub_and_propagate<uint64_t, uint64_t> *)mem_ptr, bsks,
(uint64_t **)(ksks), requested_flag, uses_carry);
POP_RANGE()
}
void cleanup_cuda_sub_and_propagate_single_carry(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
PUSH_RANGE("cleanup sub")
int_sub_and_propagate<uint64_t> *mem_ptr =
(int_sub_and_propagate<uint64_t> *)(*mem_ptr_void);
int_sub_and_propagate<uint64_t, uint64_t> *mem_ptr =
(int_sub_and_propagate<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
POP_RANGE()

View File

@@ -13,15 +13,15 @@
#include "negation.cuh"
#include "pbs/pbs_enums.h"
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_sub_and_propagate_single_carry(
CudaStreams streams, int_sub_and_propagate<Torus> **mem_ptr,
CudaStreams streams, int_sub_and_propagate<Torus, KSTorus> **mem_ptr,
uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag,
bool allocate_gpu_memory) {
PUSH_RANGE("scratch sub")
uint64_t size_tracker = 0;
*mem_ptr = new int_sub_and_propagate<Torus>(
*mem_ptr = new int_sub_and_propagate<Torus, KSTorus>(
streams, params, num_radix_blocks, requested_flag, allocate_gpu_memory,
size_tracker);
POP_RANGE()
@@ -33,14 +33,14 @@ void host_sub_and_propagate_single_carry(
CudaStreams streams, CudaRadixCiphertextFFI *lhs_array,
const CudaRadixCiphertextFFI *rhs_array, CudaRadixCiphertextFFI *carry_out,
const CudaRadixCiphertextFFI *input_carries,
int_sub_and_propagate<Torus> *mem, void *const *bsks, KSTorus *const *ksks,
uint32_t requested_flag, uint32_t uses_carry) {
int_sub_and_propagate<Torus, KSTorus> *mem, void *const *bsks,
KSTorus *const *ksks, uint32_t requested_flag, uint32_t uses_carry) {
host_negation<Torus>(streams, mem->neg_rhs_array, rhs_array,
mem->params.message_modulus, mem->params.carry_modulus,
mem->neg_rhs_array->num_radix_blocks);
host_add_and_propagate_single_carry<Torus>(
host_add_and_propagate_single_carry<Torus, KSTorus>(
streams, lhs_array, mem->neg_rhs_array, carry_out, input_carries,
mem->sc_prop_mem, bsks, ksks, requested_flag, uses_carry);
}
@@ -72,29 +72,29 @@ __host__ void host_subtraction(CudaStreams streams,
message_modulus, carry_modulus);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_integer_overflowing_sub(
CudaStreams streams, int_overflowing_sub_memory<Torus> **mem_ptr,
CudaStreams streams, int_overflowing_sub_memory<Torus, KSTorus> **mem_ptr,
uint32_t num_blocks, int_radix_params params, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type) {
PUSH_RANGE("scratch overflowing sub")
uint64_t size_tracker = 0;
*mem_ptr = new int_overflowing_sub_memory<Torus>(
*mem_ptr = new int_overflowing_sub_memory<Torus, KSTorus>(
streams, params, num_blocks, allocate_gpu_memory, noise_reduction_type,
size_tracker);
POP_RANGE()
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_integer_overflowing_sub(
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI *input_left,
const CudaRadixCiphertextFFI *input_right,
CudaRadixCiphertextFFI *overflow_block,
const CudaRadixCiphertextFFI *input_borrow,
int_borrow_prop_memory<uint64_t> *mem_ptr, void *const *bsks,
int_borrow_prop_memory<Torus, KSTorus> *mem_ptr, void *const *bsks,
Torus *const *ksks, uint32_t compute_overflow, uint32_t uses_input_borrow) {
PUSH_RANGE("overflowing sub")
if (output->num_radix_blocks != input_left->num_radix_blocks ||
@@ -124,7 +124,7 @@ __host__ void host_integer_overflowing_sub(
host_single_borrow_propagate<Torus>(
streams, output, overflow_block, input_borrow,
(int_borrow_prop_memory<Torus> *)mem_ptr, bsks, (Torus **)(ksks),
(int_borrow_prop_memory<Torus, KSTorus> *)mem_ptr, bsks, (Torus **)(ksks),
num_groups, compute_overflow, uses_input_borrow);
POP_RANGE()
}
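host_sub_and_propagate_single_carry above reduces subtraction to addition: host_negation complements the rhs blockwise, then the existing carry-propagating adder finishes the job. Over base-B digit blocks the plaintext model is B's-complement arithmetic, sketched here with the carry handled the way the adder would handle it:

#include <cstdint>
#include <vector>

// Subtraction as negate-then-add over base-B digit blocks (plaintext model;
// B plays the role of message_modulus).
std::vector<uint32_t> sub_via_neg(const std::vector<uint32_t> &a,
                                  const std::vector<uint32_t> &b, uint32_t B) {
  std::vector<uint32_t> out(a.size());
  uint32_t carry = 1; // the +1 that completes the B's-complement of b
  for (size_t i = 0; i < a.size(); ++i) {
    uint32_t t = a[i] + (B - 1 - b[i]) + carry; // add the digitwise complement
    out[i] = t % B; // message part of the block
    carry = t / B;  // carry into the next block
  }
  return out; // equals (a - b) mod B^n
}

In the encrypted version the % and / are what the carry-propagation LUTs compute, which is why subtraction can reuse sc_prop_mem almost for free.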

View File

@@ -16,8 +16,8 @@ uint64_t scratch_cuda_unchecked_all_eq_slices_64(
return scratch_cuda_unchecked_all_eq_slices<uint64_t>(
CudaStreams(streams),
(int_unchecked_all_eq_slices_buffer<uint64_t> **)mem_ptr, params,
num_inputs, num_blocks, allocate_gpu_memory);
(int_unchecked_all_eq_slices_buffer<uint64_t, uint64_t> **)mem_ptr,
params, num_inputs, num_blocks, allocate_gpu_memory);
}
void cuda_unchecked_all_eq_slices_64(CudaStreamsFFI streams,
@@ -28,16 +28,16 @@ void cuda_unchecked_all_eq_slices_64(CudaStreamsFFI streams,
int8_t *mem, void *const *bsks,
void *const *ksks) {
host_unchecked_all_eq_slices<uint64_t>(
host_unchecked_all_eq_slices<uint64_t, uint64_t>(
CudaStreams(streams), match_ct, lhs, rhs, num_inputs, num_blocks,
(int_unchecked_all_eq_slices_buffer<uint64_t> *)mem, bsks,
(int_unchecked_all_eq_slices_buffer<uint64_t, uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_all_eq_slices_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unchecked_all_eq_slices_buffer<uint64_t> *mem_ptr =
(int_unchecked_all_eq_slices_buffer<uint64_t> *)(*mem_ptr_void);
int_unchecked_all_eq_slices_buffer<uint64_t, uint64_t> *mem_ptr =
(int_unchecked_all_eq_slices_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -61,8 +61,8 @@ uint64_t scratch_cuda_unchecked_contains_sub_slice_64(
return scratch_cuda_unchecked_contains_sub_slice<uint64_t>(
CudaStreams(streams),
(int_unchecked_contains_sub_slice_buffer<uint64_t> **)mem_ptr, params,
num_lhs, num_rhs, num_blocks, allocate_gpu_memory);
(int_unchecked_contains_sub_slice_buffer<uint64_t, uint64_t> **)mem_ptr,
params, num_lhs, num_rhs, num_blocks, allocate_gpu_memory);
}
void cuda_unchecked_contains_sub_slice_64(CudaStreamsFFI streams,
@@ -75,14 +75,15 @@ void cuda_unchecked_contains_sub_slice_64(CudaStreamsFFI streams,
host_unchecked_contains_sub_slice<uint64_t>(
CudaStreams(streams), match_ct, lhs, rhs, num_rhs, num_blocks,
(int_unchecked_contains_sub_slice_buffer<uint64_t> *)mem, bsks,
(int_unchecked_contains_sub_slice_buffer<uint64_t, uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_contains_sub_slice_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unchecked_contains_sub_slice_buffer<uint64_t> *mem_ptr =
(int_unchecked_contains_sub_slice_buffer<uint64_t> *)(*mem_ptr_void);
int_unchecked_contains_sub_slice_buffer<uint64_t, uint64_t> *mem_ptr =
(int_unchecked_contains_sub_slice_buffer<uint64_t, uint64_t>
*)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));

View File

@@ -4,27 +4,28 @@
#include "integer/radix_ciphertext.cuh"
#include "integer/vector_comparison.h"
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_unchecked_all_eq_slices(
CudaStreams streams, int_unchecked_all_eq_slices_buffer<Torus> **mem_ptr,
CudaStreams streams,
int_unchecked_all_eq_slices_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_inputs, uint32_t num_blocks,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unchecked_all_eq_slices_buffer<Torus>(
*mem_ptr = new int_unchecked_all_eq_slices_buffer<Torus, KSTorus>(
streams, params, num_inputs, num_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_unchecked_all_eq_slices(
CudaStreams streams, CudaRadixCiphertextFFI *match_ct,
CudaRadixCiphertextFFI const *lhs, CudaRadixCiphertextFFI const *rhs,
uint32_t num_inputs, uint32_t num_blocks,
int_unchecked_all_eq_slices_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
int_unchecked_all_eq_slices_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
// sync_from(streams)
//
@@ -82,28 +83,28 @@ __host__ void host_unchecked_all_eq_slices(
bsks, ksks, num_inputs);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_unchecked_contains_sub_slice(
CudaStreams streams,
int_unchecked_contains_sub_slice_buffer<Torus> **mem_ptr,
int_unchecked_contains_sub_slice_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_lhs, uint32_t num_rhs,
uint32_t num_blocks, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unchecked_contains_sub_slice_buffer<Torus>(
*mem_ptr = new int_unchecked_contains_sub_slice_buffer<Torus, KSTorus>(
streams, params, num_lhs, num_rhs, num_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_unchecked_contains_sub_slice(
CudaStreams streams, CudaRadixCiphertextFFI *match_ct,
CudaRadixCiphertextFFI const *lhs, CudaRadixCiphertextFFI const *rhs,
uint32_t num_rhs, uint32_t num_blocks,
int_unchecked_contains_sub_slice_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
int_unchecked_contains_sub_slice_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
uint32_t num_windows = mem_ptr->num_windows;
@@ -114,12 +115,12 @@ __host__ void host_unchecked_contains_sub_slice(
as_radix_ciphertext_slice<Torus>(&current_result_dest,
mem_ptr->packed_results, w, w + 1);
host_unchecked_all_eq_slices<Torus>(streams, &current_result_dest,
lhs_window, rhs, num_rhs, num_blocks,
mem_ptr->all_eq_buffer, bsks, ksks);
host_unchecked_all_eq_slices<Torus, KSTorus>(
streams, &current_result_dest, lhs_window, rhs, num_rhs, num_blocks,
mem_ptr->all_eq_buffer, bsks, ksks);
}
host_integer_is_at_least_one_comparisons_block_true<Torus>(
host_integer_is_at_least_one_comparisons_block_true<Torus, KSTorus>(
streams, match_ct, mem_ptr->packed_results,
mem_ptr->final_reduction_buffer, bsks, ksks, num_windows);
}
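host_unchecked_contains_sub_slice is a sliding-window reduction: each window of lhs is compared against rhs with the all-eq primitive, the per-window booleans are packed into packed_results, and a final "is at least one block true" pass ORs them down to a single block. The plaintext shape of that computation, as a sketch:

#include <cstdint>
#include <vector>

// Windowed containment test mirroring the structure above (plaintext model:
// one equality bit per window, then an any-true reduction).
bool contains_sub_slice(const std::vector<uint64_t> &lhs,
                        const std::vector<uint64_t> &rhs) {
  if (rhs.size() > lhs.size())
    return false;
  size_t num_windows = lhs.size() - rhs.size() + 1;
  std::vector<bool> packed_results(num_windows);
  for (size_t w = 0; w < num_windows; ++w) {
    bool all_eq = true; // host_unchecked_all_eq_slices on window w
    for (size_t i = 0; i < rhs.size(); ++i)
      all_eq = all_eq && (lhs[w + i] == rhs[i]);
    packed_results[w] = all_eq;
  }
  bool any = false; // the is_at_least_one_comparisons_block_true reduction
  for (size_t w = 0; w < num_windows; ++w)
    any = any || packed_results[w];
  return any;
}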

View File

@@ -16,8 +16,9 @@ uint64_t scratch_cuda_unchecked_match_value_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_unchecked_match_value<uint64_t>(
CudaStreams(streams), (int_unchecked_match_buffer<uint64_t> **)mem_ptr,
params, num_matches, num_input_blocks, num_output_packed_blocks,
CudaStreams(streams),
(int_unchecked_match_buffer<uint64_t, uint64_t> **)mem_ptr, params,
num_matches, num_input_blocks, num_output_packed_blocks,
max_output_is_zero, allocate_gpu_memory);
}
@@ -31,14 +32,14 @@ void cuda_unchecked_match_value_64(
host_unchecked_match_value<uint64_t>(
CudaStreams(streams), lwe_array_out_result, lwe_array_out_boolean,
lwe_array_in_ct, h_match_inputs, h_match_outputs,
(int_unchecked_match_buffer<uint64_t> *)mem, bsks,
(int_unchecked_match_buffer<uint64_t, uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_match_value_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unchecked_match_buffer<uint64_t> *mem_ptr =
(int_unchecked_match_buffer<uint64_t> *)(*mem_ptr_void);
int_unchecked_match_buffer<uint64_t, uint64_t> *mem_ptr =
(int_unchecked_match_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -64,9 +65,9 @@ uint64_t scratch_cuda_unchecked_match_value_or_64(
return scratch_cuda_unchecked_match_value_or<uint64_t>(
CudaStreams(streams),
(int_unchecked_match_value_or_buffer<uint64_t> **)mem_ptr, params,
num_matches, num_input_blocks, num_match_packed_blocks, num_final_blocks,
max_output_is_zero, allocate_gpu_memory);
(int_unchecked_match_value_or_buffer<uint64_t, uint64_t> **)mem_ptr,
params, num_matches, num_input_blocks, num_match_packed_blocks,
num_final_blocks, max_output_is_zero, allocate_gpu_memory);
}
void cuda_unchecked_match_value_or_64(
@@ -79,14 +80,15 @@ void cuda_unchecked_match_value_or_64(
host_unchecked_match_value_or<uint64_t>(
CudaStreams(streams), lwe_array_out, lwe_array_in_ct, h_match_inputs,
h_match_outputs, h_or_value,
(int_unchecked_match_value_or_buffer<uint64_t> *)mem, bsks,
(int_unchecked_match_value_or_buffer<uint64_t, uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_match_value_or_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unchecked_match_value_or_buffer<uint64_t> *mem_ptr =
(int_unchecked_match_value_or_buffer<uint64_t> *)(*mem_ptr_void);
int_unchecked_match_value_or_buffer<uint64_t, uint64_t> *mem_ptr =
(int_unchecked_match_value_or_buffer<uint64_t, uint64_t>
*)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -109,8 +111,9 @@ uint64_t scratch_cuda_unchecked_contains_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_unchecked_contains<uint64_t>(
CudaStreams(streams), (int_unchecked_contains_buffer<uint64_t> **)mem_ptr,
params, num_inputs, num_blocks, allocate_gpu_memory);
CudaStreams(streams),
(int_unchecked_contains_buffer<uint64_t, uint64_t> **)mem_ptr, params,
num_inputs, num_blocks, allocate_gpu_memory);
}
void cuda_unchecked_contains_64(CudaStreamsFFI streams,
@@ -123,14 +126,14 @@ void cuda_unchecked_contains_64(CudaStreamsFFI streams,
host_unchecked_contains<uint64_t>(
CudaStreams(streams), output, inputs, value, num_inputs, num_blocks,
(int_unchecked_contains_buffer<uint64_t> *)mem, bsks,
(int_unchecked_contains_buffer<uint64_t, uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_contains_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unchecked_contains_buffer<uint64_t> *mem_ptr =
(int_unchecked_contains_buffer<uint64_t> *)(*mem_ptr_void);
int_unchecked_contains_buffer<uint64_t, uint64_t> *mem_ptr =
(int_unchecked_contains_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -154,8 +157,8 @@ uint64_t scratch_cuda_unchecked_contains_clear_64(
return scratch_cuda_unchecked_contains_clear<uint64_t>(
CudaStreams(streams),
(int_unchecked_contains_clear_buffer<uint64_t> **)mem_ptr, params,
num_inputs, num_blocks, allocate_gpu_memory);
(int_unchecked_contains_clear_buffer<uint64_t, uint64_t> **)mem_ptr,
params, num_inputs, num_blocks, allocate_gpu_memory);
}
void cuda_unchecked_contains_clear_64(CudaStreamsFFI streams,
@@ -168,14 +171,15 @@ void cuda_unchecked_contains_clear_64(CudaStreamsFFI streams,
host_unchecked_contains_clear<uint64_t>(
CudaStreams(streams), output, inputs, h_clear_val, num_inputs, num_blocks,
(int_unchecked_contains_clear_buffer<uint64_t> *)mem, bsks,
(int_unchecked_contains_clear_buffer<uint64_t, uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_contains_clear_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unchecked_contains_clear_buffer<uint64_t> *mem_ptr =
(int_unchecked_contains_clear_buffer<uint64_t> *)(*mem_ptr_void);
int_unchecked_contains_clear_buffer<uint64_t, uint64_t> *mem_ptr =
(int_unchecked_contains_clear_buffer<uint64_t, uint64_t>
*)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -199,7 +203,7 @@ uint64_t scratch_cuda_unchecked_is_in_clears_64(
return scratch_cuda_unchecked_is_in_clears<uint64_t>(
CudaStreams(streams),
(int_unchecked_is_in_clears_buffer<uint64_t> **)mem_ptr, params,
(int_unchecked_is_in_clears_buffer<uint64_t, uint64_t> **)mem_ptr, params,
num_clears, num_blocks, allocate_gpu_memory);
}
@@ -213,14 +217,14 @@ void cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams,
host_unchecked_is_in_clears<uint64_t>(
CudaStreams(streams), output, input, h_cleartexts, num_clears, num_blocks,
(int_unchecked_is_in_clears_buffer<uint64_t> *)mem, bsks,
(int_unchecked_is_in_clears_buffer<uint64_t, uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_is_in_clears_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unchecked_is_in_clears_buffer<uint64_t> *mem_ptr =
(int_unchecked_is_in_clears_buffer<uint64_t> *)(*mem_ptr_void);
int_unchecked_is_in_clears_buffer<uint64_t, uint64_t> *mem_ptr =
(int_unchecked_is_in_clears_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -244,8 +248,8 @@ uint64_t scratch_cuda_unchecked_index_in_clears_64(
return scratch_cuda_unchecked_index_in_clears<uint64_t>(
CudaStreams(streams),
(int_unchecked_index_in_clears_buffer<uint64_t> **)mem_ptr, params,
num_clears, num_blocks, num_blocks_index, allocate_gpu_memory);
(int_unchecked_index_in_clears_buffer<uint64_t, uint64_t> **)mem_ptr,
params, num_clears, num_blocks, num_blocks_index, allocate_gpu_memory);
}
void cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams,
@@ -260,14 +264,15 @@ void cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams,
host_unchecked_index_in_clears<uint64_t>(
CudaStreams(streams), index_ct, match_ct, input, h_cleartexts, num_clears,
num_blocks, num_blocks_index,
(int_unchecked_index_in_clears_buffer<uint64_t> *)mem, bsks,
(int_unchecked_index_in_clears_buffer<uint64_t, uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_index_in_clears_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unchecked_index_in_clears_buffer<uint64_t> *mem_ptr =
(int_unchecked_index_in_clears_buffer<uint64_t> *)(*mem_ptr_void);
int_unchecked_index_in_clears_buffer<uint64_t, uint64_t> *mem_ptr =
(int_unchecked_index_in_clears_buffer<uint64_t, uint64_t>
*)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -291,8 +296,9 @@ uint64_t scratch_cuda_unchecked_first_index_in_clears_64(
return scratch_cuda_unchecked_first_index_in_clears<uint64_t>(
CudaStreams(streams),
(int_unchecked_first_index_in_clears_buffer<uint64_t> **)mem_ptr, params,
num_unique, num_blocks, num_blocks_index, allocate_gpu_memory);
(int_unchecked_first_index_in_clears_buffer<uint64_t, uint64_t> **)
mem_ptr,
params, num_unique, num_blocks, num_blocks_index, allocate_gpu_memory);
}
void cuda_unchecked_first_index_in_clears_64(
@@ -305,14 +311,15 @@ void cuda_unchecked_first_index_in_clears_64(
host_unchecked_first_index_in_clears<uint64_t>(
CudaStreams(streams), index_ct, match_ct, input, h_unique_values,
h_unique_indices, num_unique, num_blocks, num_blocks_index,
(int_unchecked_first_index_in_clears_buffer<uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
(int_unchecked_first_index_in_clears_buffer<uint64_t, uint64_t> *)mem,
bsks, (uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_first_index_in_clears_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unchecked_first_index_in_clears_buffer<uint64_t> *mem_ptr =
(int_unchecked_first_index_in_clears_buffer<uint64_t> *)(*mem_ptr_void);
int_unchecked_first_index_in_clears_buffer<uint64_t, uint64_t> *mem_ptr =
(int_unchecked_first_index_in_clears_buffer<uint64_t, uint64_t>
*)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -336,8 +343,8 @@ uint64_t scratch_cuda_unchecked_first_index_of_clear_64(
return scratch_cuda_unchecked_first_index_of_clear<uint64_t>(
CudaStreams(streams),
(int_unchecked_first_index_of_clear_buffer<uint64_t> **)mem_ptr, params,
num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
(int_unchecked_first_index_of_clear_buffer<uint64_t, uint64_t> **)mem_ptr,
params, num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
}
void cuda_unchecked_first_index_of_clear_64(
@@ -350,14 +357,15 @@ void cuda_unchecked_first_index_of_clear_64(
host_unchecked_first_index_of_clear<uint64_t>(
CudaStreams(streams), index_ct, match_ct, inputs, h_clear_val, num_inputs,
num_blocks, num_blocks_index,
(int_unchecked_first_index_of_clear_buffer<uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
(int_unchecked_first_index_of_clear_buffer<uint64_t, uint64_t> *)mem,
bsks, (uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_first_index_of_clear_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unchecked_first_index_of_clear_buffer<uint64_t> *mem_ptr =
(int_unchecked_first_index_of_clear_buffer<uint64_t> *)(*mem_ptr_void);
int_unchecked_first_index_of_clear_buffer<uint64_t, uint64_t> *mem_ptr =
(int_unchecked_first_index_of_clear_buffer<uint64_t, uint64_t>
*)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -381,8 +389,8 @@ uint64_t scratch_cuda_unchecked_first_index_of_64(
return scratch_cuda_unchecked_first_index_of<uint64_t>(
CudaStreams(streams),
(int_unchecked_first_index_of_buffer<uint64_t> **)mem_ptr, params,
num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
(int_unchecked_first_index_of_buffer<uint64_t, uint64_t> **)mem_ptr,
params, num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
}
void cuda_unchecked_first_index_of_64(CudaStreamsFFI streams,
@@ -397,14 +405,15 @@ void cuda_unchecked_first_index_of_64(CudaStreamsFFI streams,
host_unchecked_first_index_of<uint64_t>(
CudaStreams(streams), index_ct, match_ct, inputs, value, num_inputs,
num_blocks, num_blocks_index,
(int_unchecked_first_index_of_buffer<uint64_t> *)mem, bsks,
(int_unchecked_first_index_of_buffer<uint64_t, uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_first_index_of_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unchecked_first_index_of_buffer<uint64_t> *mem_ptr =
(int_unchecked_first_index_of_buffer<uint64_t> *)(*mem_ptr_void);
int_unchecked_first_index_of_buffer<uint64_t, uint64_t> *mem_ptr =
(int_unchecked_first_index_of_buffer<uint64_t, uint64_t>
*)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -427,8 +436,9 @@ uint64_t scratch_cuda_unchecked_index_of_64(
message_modulus, carry_modulus, noise_reduction_type);
return scratch_cuda_unchecked_index_of<uint64_t>(
CudaStreams(streams), (int_unchecked_index_of_buffer<uint64_t> **)mem_ptr,
params, num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
CudaStreams(streams),
(int_unchecked_index_of_buffer<uint64_t, uint64_t> **)mem_ptr, params,
num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
}
void cuda_unchecked_index_of_64(CudaStreamsFFI streams,
@@ -443,14 +453,14 @@ void cuda_unchecked_index_of_64(CudaStreamsFFI streams,
host_unchecked_index_of<uint64_t>(
CudaStreams(streams), index_ct, match_ct, inputs, value, num_inputs,
num_blocks, num_blocks_index,
(int_unchecked_index_of_buffer<uint64_t> *)mem, bsks,
(int_unchecked_index_of_buffer<uint64_t, uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_index_of_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unchecked_index_of_buffer<uint64_t> *mem_ptr =
(int_unchecked_index_of_buffer<uint64_t> *)(*mem_ptr_void);
int_unchecked_index_of_buffer<uint64_t, uint64_t> *mem_ptr =
(int_unchecked_index_of_buffer<uint64_t, uint64_t> *)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
@@ -474,8 +484,8 @@ uint64_t scratch_cuda_unchecked_index_of_clear_64(
return scratch_cuda_unchecked_index_of_clear<uint64_t>(
CudaStreams(streams),
(int_unchecked_index_of_clear_buffer<uint64_t> **)mem_ptr, params,
num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
(int_unchecked_index_of_clear_buffer<uint64_t, uint64_t> **)mem_ptr,
params, num_inputs, num_blocks, num_blocks_index, allocate_gpu_memory);
}
void cuda_unchecked_index_of_clear_64(
@@ -490,14 +500,15 @@ void cuda_unchecked_index_of_clear_64(
CudaStreams(streams), index_ct, match_ct, inputs,
(const uint64_t *)d_scalar_blocks, is_scalar_obviously_bigger, num_inputs,
num_blocks, num_scalar_blocks, num_blocks_index,
(int_unchecked_index_of_clear_buffer<uint64_t> *)mem, bsks,
(int_unchecked_index_of_clear_buffer<uint64_t, uint64_t> *)mem, bsks,
(uint64_t *const *)ksks);
}
void cleanup_cuda_unchecked_index_of_clear_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
int_unchecked_index_of_clear_buffer<uint64_t> *mem_ptr =
(int_unchecked_index_of_clear_buffer<uint64_t> *)(*mem_ptr_void);
int_unchecked_index_of_clear_buffer<uint64_t, uint64_t> *mem_ptr =
(int_unchecked_index_of_clear_buffer<uint64_t, uint64_t>
*)(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));

View File

@@ -8,12 +8,12 @@
#include "integer/scalar_comparison.cuh"
#include "integer/vector_find.h"
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_compute_equality_selectors(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out_list,
CudaRadixCiphertextFFI const *lwe_array_in, uint32_t num_blocks,
const uint64_t *h_decomposed_cleartexts,
int_equality_selectors_buffer<Torus> *mem_ptr, void *const *bsks,
int_equality_selectors_buffer<Torus, KSTorus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
uint32_t num_possible_values = mem_ptr->num_possible_values;
@@ -37,7 +37,7 @@ __host__ void host_compute_equality_selectors(
CudaRadixCiphertextFFI *current_tmp_block_comparisons =
mem_ptr->tmp_block_comparisons[stream_idx];
int_comparison_buffer<Torus> *current_reduction_buffer =
int_comparison_buffer<Torus, KSTorus> *current_reduction_buffer =
mem_ptr->reduction_buffers[stream_idx];
const uint64_t *current_clear_blocks =
@@ -70,26 +70,27 @@ __host__ void host_compute_equality_selectors(
streams);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_compute_equality_selectors(
CudaStreams streams, int_equality_selectors_buffer<Torus> **mem_ptr,
CudaStreams streams,
int_equality_selectors_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_possible_values, uint32_t num_blocks,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_equality_selectors_buffer<Torus>(
*mem_ptr = new int_equality_selectors_buffer<Torus, KSTorus>(
streams, params, num_possible_values, num_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_create_possible_results(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out_list,
CudaRadixCiphertextFFI const *lwe_array_in_list,
uint32_t num_possible_values, const uint64_t *h_decomposed_cleartexts,
uint32_t num_blocks, int_possible_results_buffer<Torus> *mem_ptr,
uint32_t num_blocks, int_possible_results_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
uint32_t max_packed_value = mem_ptr->max_packed_value;
@@ -116,7 +117,8 @@ __host__ void host_create_possible_results(
uint32_t lut_index = stream_idx * num_lut_accumulators + k;
int_radix_lut<Torus> *current_lut = mem_ptr->stream_luts[lut_index];
int_radix_lut<Torus, KSTorus> *current_lut =
mem_ptr->stream_luts[lut_index];
uint32_t luts_in_this_call = current_lut->num_many_lut;
@@ -152,26 +154,26 @@ __host__ void host_create_possible_results(
streams);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_create_possible_results(
CudaStreams streams, int_possible_results_buffer<Torus> **mem_ptr,
CudaStreams streams, int_possible_results_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_blocks, uint32_t num_possible_values,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_possible_results_buffer<Torus>(
*mem_ptr = new int_possible_results_buffer<Torus, KSTorus>(
streams, params, num_blocks, num_possible_values, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_aggregate_one_hot_vector(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in_list,
uint32_t num_input_ciphertexts, uint32_t num_blocks,
int_aggregate_one_hot_buffer<Torus> *mem_ptr, void *const *bsks,
int_aggregate_one_hot_buffer<Torus, KSTorus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
int_radix_params params = mem_ptr->params;
@@ -191,7 +193,7 @@ __host__ void host_aggregate_one_hot_vector(
CudaRadixCiphertextFFI *current_agg =
mem_ptr->partial_aggregated_vectors[s];
CudaRadixCiphertextFFI *current_temp = mem_ptr->partial_temp_vectors[s];
int_radix_lut<Torus> *current_identity_lut =
int_radix_lut<Torus, KSTorus> *current_identity_lut =
mem_ptr->stream_identity_luts[s];
uint32_t start_idx = s * inputs_per_stream;
@@ -322,27 +324,27 @@ __host__ void host_aggregate_one_hot_vector(
}
}
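
`host_aggregate_one_hot_vector` relies on the one-hot invariant: at most one of the input ciphertexts carries a non-zero (selected) value, so the aggregate is the block-wise sum of all of them, with the identity LUTs seen above (`stream_identity_luts`) apparently there to refresh the blocks afterwards. A cleartext sketch of the same fold:

#include <cstddef>
#include <cstdint>
#include <vector>

// Cleartext analogue: the candidates form a one-hot vector of possible
// results (all-zero except the selected entry), so aggregation is a sum.
std::vector<uint64_t> aggregate_one_hot(
    const std::vector<std::vector<uint64_t>> &candidates, size_t num_blocks) {
  std::vector<uint64_t> out(num_blocks, 0);
  for (const auto &cand : candidates)  // fold into the aggregated vector
    for (size_t b = 0; b < num_blocks; ++b)
      out[b] += cand[b];               // block-wise addition
  return out;
}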
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_aggregate_one_hot_vector(
CudaStreams streams, int_aggregate_one_hot_buffer<Torus> **mem_ptr,
CudaStreams streams, int_aggregate_one_hot_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_blocks, uint32_t num_matches,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_aggregate_one_hot_buffer<Torus>(
*mem_ptr = new int_aggregate_one_hot_buffer<Torus, KSTorus>(
streams, params, num_blocks, num_matches, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_unchecked_match_value(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out_result,
CudaRadixCiphertextFFI *lwe_array_out_boolean,
CudaRadixCiphertextFFI const *lwe_array_in_ct,
const uint64_t *h_match_inputs, const uint64_t *h_match_outputs,
int_unchecked_match_buffer<Torus> *mem_ptr, void *const *bsks,
int_unchecked_match_buffer<Torus, KSTorus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
host_compute_equality_selectors<Torus>(
streams, mem_ptr->selectors_list, lwe_array_in_ct,
@@ -382,44 +384,45 @@ __host__ void host_unchecked_match_value(
mem_ptr->num_matches);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_unchecked_match_value(
CudaStreams streams, int_unchecked_match_buffer<Torus> **mem_ptr,
CudaStreams streams, int_unchecked_match_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_matches, uint32_t num_input_blocks,
uint32_t num_output_packed_blocks, bool max_output_is_zero,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unchecked_match_buffer<Torus>(
*mem_ptr = new int_unchecked_match_buffer<Torus, KSTorus>(
streams, params, num_matches, num_input_blocks, num_output_packed_blocks,
max_output_is_zero, allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_unchecked_match_value_or(
CudaStreams streams, int_unchecked_match_value_or_buffer<Torus> **mem_ptr,
CudaStreams streams,
int_unchecked_match_value_or_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_matches, uint32_t num_input_blocks,
uint32_t num_match_packed_blocks, uint32_t num_final_blocks,
bool max_output_is_zero, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unchecked_match_value_or_buffer<Torus>(
*mem_ptr = new int_unchecked_match_value_or_buffer<Torus, KSTorus>(
streams, params, num_matches, num_input_blocks, num_match_packed_blocks,
num_final_blocks, max_output_is_zero, allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_unchecked_match_value_or(
CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
CudaRadixCiphertextFFI const *lwe_array_in_ct,
const uint64_t *h_match_inputs, const uint64_t *h_match_outputs,
const uint64_t *h_or_value,
int_unchecked_match_value_or_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
int_unchecked_match_value_or_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
host_unchecked_match_value<Torus>(streams, mem_ptr->tmp_match_result,
mem_ptr->tmp_match_bool, lwe_array_in_ct,
@@ -440,28 +443,28 @@ __host__ void host_unchecked_match_value_or(
mem_ptr->cmux_buffer, bsks, (Torus **)ksks);
}
template <typename Torus>
uint64_t
scratch_cuda_unchecked_contains(CudaStreams streams,
int_unchecked_contains_buffer<Torus> **mem_ptr,
int_radix_params params, uint32_t num_inputs,
uint32_t num_blocks, bool allocate_gpu_memory) {
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_unchecked_contains(
CudaStreams streams,
int_unchecked_contains_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_inputs, uint32_t num_blocks,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unchecked_contains_buffer<Torus>(
*mem_ptr = new int_unchecked_contains_buffer<Torus, KSTorus>(
streams, params, num_inputs, num_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void
host_unchecked_contains(CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *inputs,
CudaRadixCiphertextFFI const *value,
uint32_t num_inputs, uint32_t num_blocks,
int_unchecked_contains_buffer<Torus> *mem_ptr,
int_unchecked_contains_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
@@ -492,27 +495,28 @@ host_unchecked_contains(CudaStreams streams, CudaRadixCiphertextFFI *output,
bsks, (Torus **)ksks, num_inputs);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_unchecked_contains_clear(
CudaStreams streams, int_unchecked_contains_clear_buffer<Torus> **mem_ptr,
CudaStreams streams,
int_unchecked_contains_clear_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_inputs, uint32_t num_blocks,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unchecked_contains_clear_buffer<Torus>(
*mem_ptr = new int_unchecked_contains_clear_buffer<Torus, KSTorus>(
streams, params, num_inputs, num_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_unchecked_contains_clear(
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *inputs, const uint64_t *h_clear_val,
uint32_t num_inputs, uint32_t num_blocks,
int_unchecked_contains_clear_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
int_unchecked_contains_clear_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
cuda_memcpy_async_to_gpu(mem_ptr->d_clear_val, h_clear_val,
num_blocks * sizeof(Torus), streams.stream(0),
@@ -552,28 +556,28 @@ __host__ void host_unchecked_contains_clear(
bsks, (Torus **)ksks, num_inputs);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_unchecked_is_in_clears(
CudaStreams streams, int_unchecked_is_in_clears_buffer<Torus> **mem_ptr,
CudaStreams streams,
int_unchecked_is_in_clears_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_clears, uint32_t num_blocks,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unchecked_is_in_clears_buffer<Torus>(
*mem_ptr = new int_unchecked_is_in_clears_buffer<Torus, KSTorus>(
streams, params, num_clears, num_blocks, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
__host__ void
host_unchecked_is_in_clears(CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input,
const uint64_t *h_cleartexts, uint32_t num_clears,
uint32_t num_blocks,
int_unchecked_is_in_clears_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
template <typename Torus, typename KSTorus>
__host__ void host_unchecked_is_in_clears(
CudaStreams streams, CudaRadixCiphertextFFI *output,
CudaRadixCiphertextFFI const *input, const uint64_t *h_cleartexts,
uint32_t num_clears, uint32_t num_blocks,
int_unchecked_is_in_clears_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
host_compute_equality_selectors<Torus>(streams, mem_ptr->unpacked_selectors,
input, num_blocks, h_cleartexts,
@@ -584,13 +588,13 @@ host_unchecked_is_in_clears(CudaStreams streams, CudaRadixCiphertextFFI *output,
bsks, (Torus **)ksks, num_clears);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_compute_final_index_from_selectors(
CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *selectors,
uint32_t num_inputs, uint32_t num_blocks_index,
int_final_index_from_selectors_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
int_final_index_from_selectors_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
for (uint32_t i = 0; i < num_inputs; i++) {
CudaRadixCiphertextFFI const *src_selector = &selectors[i];
@@ -616,42 +620,44 @@ __host__ void host_compute_final_index_from_selectors(
bsks, (Torus **)ksks, num_inputs);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_compute_final_index_from_selectors(
CudaStreams streams, int_final_index_from_selectors_buffer<Torus> **mem_ptr,
CudaStreams streams,
int_final_index_from_selectors_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_inputs, uint32_t num_blocks_index,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_final_index_from_selectors_buffer<Torus>(
*mem_ptr = new int_final_index_from_selectors_buffer<Torus, KSTorus>(
streams, params, num_inputs, num_blocks_index, allocate_gpu_memory,
size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_unchecked_index_in_clears(
CudaStreams streams, int_unchecked_index_in_clears_buffer<Torus> **mem_ptr,
CudaStreams streams,
int_unchecked_index_in_clears_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_clears, uint32_t num_blocks,
uint32_t num_blocks_index, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unchecked_index_in_clears_buffer<Torus>(
*mem_ptr = new int_unchecked_index_in_clears_buffer<Torus, KSTorus>(
streams, params, num_clears, num_blocks, num_blocks_index,
allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_unchecked_index_in_clears(
CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input,
const uint64_t *h_cleartexts, uint32_t num_clears, uint32_t num_blocks,
uint32_t num_blocks_index,
int_unchecked_index_in_clears_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
int_unchecked_index_in_clears_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
host_compute_equality_selectors<Torus>(
streams, mem_ptr->final_index_buf->unpacked_selectors, input, num_blocks,
@@ -676,28 +682,28 @@ __host__ void host_unchecked_index_in_clears(
num_clears);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_unchecked_first_index_in_clears(
CudaStreams streams,
int_unchecked_first_index_in_clears_buffer<Torus> **mem_ptr,
int_unchecked_first_index_in_clears_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_unique, uint32_t num_blocks,
uint32_t num_blocks_index, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unchecked_first_index_in_clears_buffer<Torus>(
*mem_ptr = new int_unchecked_first_index_in_clears_buffer<Torus, KSTorus>(
streams, params, num_unique, num_blocks, num_blocks_index,
allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_unchecked_first_index_in_clears(
CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *input,
const uint64_t *h_unique_values, const uint64_t *h_unique_indices,
uint32_t num_unique, uint32_t num_blocks, uint32_t num_blocks_index,
int_unchecked_first_index_in_clears_buffer<Torus> *mem_ptr,
int_unchecked_first_index_in_clears_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
host_compute_equality_selectors<Torus>(streams, mem_ptr->unpacked_selectors,
@@ -720,28 +726,28 @@ __host__ void host_unchecked_first_index_in_clears(
bsks, (Torus **)ksks, num_unique);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_unchecked_first_index_of_clear(
CudaStreams streams,
int_unchecked_first_index_of_clear_buffer<Torus> **mem_ptr,
int_unchecked_first_index_of_clear_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_inputs, uint32_t num_blocks,
uint32_t num_blocks_index, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unchecked_first_index_of_clear_buffer<Torus>(
*mem_ptr = new int_unchecked_first_index_of_clear_buffer<Torus, KSTorus>(
streams, params, num_inputs, num_blocks, num_blocks_index,
allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_unchecked_first_index_of_clear(
CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
const uint64_t *h_clear_val, uint32_t num_inputs, uint32_t num_blocks,
uint32_t num_blocks_index,
int_unchecked_first_index_of_clear_buffer<Torus> *mem_ptr,
int_unchecked_first_index_of_clear_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
cuda_memcpy_async_to_gpu(mem_ptr->d_clear_val, h_clear_val,
@@ -813,28 +819,29 @@ __host__ void host_unchecked_first_index_of_clear(
bsks, (Torus **)ksks, num_inputs);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_unchecked_first_index_of(
CudaStreams streams, int_unchecked_first_index_of_buffer<Torus> **mem_ptr,
CudaStreams streams,
int_unchecked_first_index_of_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_inputs, uint32_t num_blocks,
uint32_t num_blocks_index, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unchecked_first_index_of_buffer<Torus>(
*mem_ptr = new int_unchecked_first_index_of_buffer<Torus, KSTorus>(
streams, params, num_inputs, num_blocks, num_blocks_index,
allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_unchecked_first_index_of(
CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
CudaRadixCiphertextFFI const *value, uint32_t num_inputs,
uint32_t num_blocks, uint32_t num_blocks_index,
int_unchecked_first_index_of_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
int_unchecked_first_index_of_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
streams);
@@ -895,27 +902,28 @@ __host__ void host_unchecked_first_index_of(
bsks, (Torus **)ksks, num_inputs);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_unchecked_index_of(
CudaStreams streams, int_unchecked_index_of_buffer<Torus> **mem_ptr,
CudaStreams streams,
int_unchecked_index_of_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_inputs, uint32_t num_blocks,
uint32_t num_blocks_index, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unchecked_index_of_buffer<Torus>(
*mem_ptr = new int_unchecked_index_of_buffer<Torus, KSTorus>(
streams, params, num_inputs, num_blocks, num_blocks_index,
allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_unchecked_index_of(
CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
CudaRadixCiphertextFFI const *value, uint32_t num_inputs,
uint32_t num_blocks, uint32_t num_blocks_index,
int_unchecked_index_of_buffer<Torus> *mem_ptr, void *const *bsks,
int_unchecked_index_of_buffer<Torus, KSTorus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
@@ -961,29 +969,30 @@ __host__ void host_unchecked_index_of(
num_inputs);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
uint64_t scratch_cuda_unchecked_index_of_clear(
CudaStreams streams, int_unchecked_index_of_clear_buffer<Torus> **mem_ptr,
CudaStreams streams,
int_unchecked_index_of_clear_buffer<Torus, KSTorus> **mem_ptr,
int_radix_params params, uint32_t num_inputs, uint32_t num_blocks,
uint32_t num_blocks_index, bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new int_unchecked_index_of_clear_buffer<Torus>(
*mem_ptr = new int_unchecked_index_of_clear_buffer<Torus, KSTorus>(
streams, params, num_inputs, num_blocks, num_blocks_index,
allocate_gpu_memory, size_tracker);
return size_tracker;
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ void host_unchecked_index_of_clear(
CudaStreams streams, CudaRadixCiphertextFFI *index_ct,
CudaRadixCiphertextFFI *match_ct, CudaRadixCiphertextFFI const *inputs,
const Torus *d_scalar_blocks, bool is_scalar_obviously_bigger,
uint32_t num_inputs, uint32_t num_blocks, uint32_t num_scalar_blocks,
uint32_t num_blocks_index,
int_unchecked_index_of_clear_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
int_unchecked_index_of_clear_buffer<Torus, KSTorus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
CudaRadixCiphertextFFI *packed_selectors =
mem_ptr->final_index_buf->packed_selectors;

View File

@@ -289,13 +289,23 @@ void execute_pbs_async(CudaStreams streams,
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, base_log, level_count,
num_inputs_on_gpu, num_many_lut, lut_stride);
if constexpr (std::is_same_v<InputTorus, uint32_t>) {
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log,
level_count, num_inputs_on_gpu, num_many_lut, lut_stride);
} else {
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log,
level_count, num_inputs_on_gpu, num_many_lut, lut_stride);
}
}
break;
case CLASSICAL:
@@ -318,13 +328,23 @@ void execute_pbs_async(CudaStreams streams,
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, num_inputs_on_gpu,
num_many_lut, lut_stride);
if constexpr (std::is_same_v<InputTorus, uint32_t>) {
cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count,
num_inputs_on_gpu, num_many_lut, lut_stride);
} else {
cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count,
num_inputs_on_gpu, num_many_lut, lut_stride);
}
}
break;
default:
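
Both the multi-bit and classical paths above now pick their entry point at compile time: when `InputTorus` is `uint32_t`, the `_32_64` variant (32-bit input ciphertexts, 64-bit output) is called; otherwise the 64-bit one is. Because `if constexpr` discards the untaken branch at instantiation time, each specialization of `execute_pbs_async` only ever references the entry point that matches its input torus. The idiom in isolation, with placeholder bootstrap functions:

#include <cstdint>
#include <type_traits>

// Hypothetical stand-ins for the _32_64 / _64_64 entry points.
void bootstrap_32_64(const uint32_t *in, uint64_t *out) { *out = *in; }
void bootstrap_64_64(const uint64_t *in, uint64_t *out) { *out = *in; }

template <typename InputTorus>
void run_pbs(const InputTorus *in, uint64_t *out) {
  // Resolved per instantiation: the discarded branch is not instantiated,
  // so a call that would be ill-formed for this InputTorus never appears.
  if constexpr (std::is_same_v<InputTorus, uint32_t>)
    bootstrap_32_64(in, out);
  else
    bootstrap_64_64(in, out);
}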

View File

@@ -692,6 +692,13 @@ scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template uint64_t
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint32_t, uint64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template void
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
uint64_t>(
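
The hunk above adds an explicit instantiation definition for the `<uint32_t, uint64_t>` pairing next to the existing `<uint64_t, uint64_t>` one: the template bodies stay in this .cu translation unit, and callers elsewhere link against the pre-instantiated symbols. The mechanism in miniature, with a hypothetical `scratch_pbs`:

#include <cstdint>

template <typename InputTorus, typename OutputTorus>
uint64_t scratch_pbs(uint32_t glwe_dimension, uint32_t polynomial_size) {
  // Placeholder body; only the instantiation mechanics matter here.
  return uint64_t(glwe_dimension) * polynomial_size * sizeof(OutputTorus);
}

// Explicit instantiation definitions: emitted in this translation unit so
// other translation units can call them without seeing the template body.
template uint64_t scratch_pbs<uint64_t, uint64_t>(uint32_t, uint32_t);
template uint64_t scratch_pbs<uint32_t, uint64_t>(uint32_t, uint32_t);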

View File

@@ -35,7 +35,7 @@ uint64_t scratch_cuda_expand_without_verification_64(
return scratch_cuda_expand_without_verification<uint64_t>(
CudaStreams(streams),
reinterpret_cast<zk_expand_mem<uint64_t> **>(mem_ptr),
reinterpret_cast<zk_expand_mem<uint64_t, uint64_t> **>(mem_ptr),
num_lwes_per_compact_list, is_boolean_array, num_compact_lists,
computing_params, casting_params, casting_key_type, allocate_gpu_memory);
}
@@ -45,53 +45,55 @@ void cuda_expand_without_verification_64(
const void *lwe_flattened_compact_array_in, int8_t *mem_ptr,
void *const *bsks, void *const *computing_ksks, void *const *casting_keys) {
auto expand_buffer = reinterpret_cast<zk_expand_mem<uint64_t> *>(mem_ptr);
auto expand_buffer =
reinterpret_cast<zk_expand_mem<uint64_t, uint64_t> *>(mem_ptr);
switch (expand_buffer->casting_params.big_lwe_dimension) {
case 256:
host_expand_without_verification<uint64_t, AmortizedDegree<256>>(
host_expand_without_verification<uint64_t, uint64_t, AmortizedDegree<256>>(
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks));
break;
case 512:
host_expand_without_verification<uint64_t, AmortizedDegree<512>>(
host_expand_without_verification<uint64_t, uint64_t, AmortizedDegree<512>>(
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks));
break;
case 1024:
host_expand_without_verification<uint64_t, AmortizedDegree<1024>>(
host_expand_without_verification<uint64_t, uint64_t, AmortizedDegree<1024>>(
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks));
break;
case 2048:
host_expand_without_verification<uint64_t, AmortizedDegree<2048>>(
host_expand_without_verification<uint64_t, uint64_t, AmortizedDegree<2048>>(
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks));
break;
case 4096:
host_expand_without_verification<uint64_t, AmortizedDegree<4096>>(
host_expand_without_verification<uint64_t, uint64_t, AmortizedDegree<4096>>(
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks));
break;
case 8192:
host_expand_without_verification<uint64_t, AmortizedDegree<8192>>(
host_expand_without_verification<uint64_t, uint64_t, AmortizedDegree<8192>>(
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
(uint64_t **)(computing_ksks));
break;
case 16384:
host_expand_without_verification<uint64_t, AmortizedDegree<16384>>(
host_expand_without_verification<uint64_t, uint64_t,
AmortizedDegree<16384>>(
streams, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_flattened_compact_array_in),
expand_buffer, (uint64_t **)casting_keys, bsks,
@@ -108,8 +110,8 @@ void cuda_expand_without_verification_64(
void cleanup_expand_without_verification_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void) {
zk_expand_mem<uint64_t> *mem_ptr =
reinterpret_cast<zk_expand_mem<uint64_t> *>(*mem_ptr_void);
zk_expand_mem<uint64_t, uint64_t> *mem_ptr =
reinterpret_cast<zk_expand_mem<uint64_t, uint64_t> *>(*mem_ptr_void);
mem_ptr->release(CudaStreams(streams));
delete mem_ptr;
*mem_ptr_void = nullptr;
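
The switch over `big_lwe_dimension` a few hunks up exists because the degree parameterizes the kernels as `AmortizedDegree<N>`, a compile-time constant: the runtime dimension has to be mapped onto one of a fixed set of instantiations, one case per supported size. A minimal sketch of that dispatch shape, with a hypothetical `run_expand`:

#include <cstdint>
#include <stdexcept>

// Hypothetical kernel wrapper: Degree must be known at compile time.
template <uint32_t Degree>
void run_expand(const uint64_t *in, uint64_t *out) {
  for (uint32_t i = 0; i < Degree; ++i) // placeholder body
    out[i] = in[i];
}

void dispatch_expand(uint32_t big_lwe_dimension, const uint64_t *in,
                     uint64_t *out) {
  switch (big_lwe_dimension) { // runtime value -> compile-time constant
  case 256:
    run_expand<256>(in, out);
    break;
  case 512:
    run_expand<512>(in, out);
    break;
  case 1024:
    run_expand<1024>(in, out);
    break;
  default: // the real entry point covers sizes up to 16384
    throw std::invalid_argument("unsupported big LWE dimension");
  }
}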

View File

@@ -15,11 +15,13 @@
#include "zk/zk_utilities.h"
#include <functional>
template <typename Torus, class params>
__host__ void host_expand_without_verification(
CudaStreams streams, Torus *lwe_array_out,
const Torus *lwe_flattened_compact_array_in, zk_expand_mem<Torus> *mem_ptr,
Torus *const *casting_keys, void *const *bsks, Torus *const *compute_ksks) {
template <typename Torus, typename KSTorus, class params>
__host__ void
host_expand_without_verification(CudaStreams streams, Torus *lwe_array_out,
const Torus *lwe_flattened_compact_array_in,
zk_expand_mem<Torus, KSTorus> *mem_ptr,
Torus *const *casting_keys, void *const *bsks,
Torus *const *compute_ksks) {
// Expand
auto casting_key_type = mem_ptr->casting_key_type;
auto expanded_lwes = mem_ptr->tmp_expanded_lwes;
@@ -100,16 +102,16 @@ __host__ void host_expand_without_verification(
2 * num_lwes);
}
template <typename Torus>
template <typename Torus, typename KSTorus>
__host__ uint64_t scratch_cuda_expand_without_verification(
CudaStreams streams, zk_expand_mem<Torus> **mem_ptr,
CudaStreams streams, zk_expand_mem<Torus, KSTorus> **mem_ptr,
const uint32_t *num_lwes_per_compact_list, const bool *is_boolean_array,
uint32_t num_compact_lists, int_radix_params computing_params,
int_radix_params casting_params, KS_TYPE casting_key_type,
bool allocate_gpu_memory) {
uint64_t size_tracker = 0;
*mem_ptr = new zk_expand_mem<Torus>(
*mem_ptr = new zk_expand_mem<Torus, KSTorus>(
streams, computing_params, casting_params, casting_key_type,
num_lwes_per_compact_list, is_boolean_array, num_compact_lists,
allocate_gpu_memory, size_tracker);

View File

@@ -206,7 +206,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
return;
}
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
true);