Mirror of https://github.com/zama-ai/tfhe-rs.git (synced 2026-01-06 21:34:05 -05:00)
refactor(gpu): Threshold for multi-GPU with Classical PBS

commit ca2a79f1fb, parent 0a59e86675, committed by Agnès Leroy
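This change replaces the single THRESHOLD_MULTI_GPU constant (12) with two
constants, THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS (12) and
THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS (68), and threads the PBS type
through get_active_gpu_count, CudaStreams::active_gpu_subset and the
multi-GPU LWE allocation helpers. The number of inputs a single GPU handles
before work is spread across devices now depends on whether classical or
multi-bit bootstrapping parameters are in use: with classical parameters a
workload must be roughly five to six times larger before a second GPU is
enlisted.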
@@ -35,7 +35,8 @@ template <typename Torus> struct int_aes_lut_buffers {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, and_lambda, allocate_gpu_memory);
     auto active_streams_and_lut = streams.active_gpu_subset(
-        SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism);
+        SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
+        params.pbs_type);
     this->and_lut->broadcast_lut(active_streams_and_lut);
     this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

@@ -50,8 +51,8 @@ template <typename Torus> struct int_aes_lut_buffers {
         this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, flush_lambda, allocate_gpu_memory);
-    auto active_streams_flush_lut =
-        streams.active_gpu_subset(AES_STATE_BITS * num_aes_inputs);
+    auto active_streams_flush_lut = streams.active_gpu_subset(
+        AES_STATE_BITS * num_aes_inputs, params.pbs_type);
     this->flush_lut->broadcast_lut(active_streams_flush_lut);
     this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

@@ -65,7 +66,8 @@ template <typename Torus> struct int_aes_lut_buffers {
         this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, carry_lambda, allocate_gpu_memory);
-    auto active_streams_carry_lut = streams.active_gpu_subset(num_aes_inputs);
+    auto active_streams_carry_lut =
+        streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
     this->carry_lut->broadcast_lut(active_streams_carry_lut);
     this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

@@ -8,7 +8,8 @@
 
 extern std::mutex m;
 extern bool p2p_enabled;
-extern const int THRESHOLD_MULTI_GPU;
+extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
+extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
 
 extern "C" {
 int32_t cuda_setup_multi_gpu(int device_0_id);

@@ -39,7 +40,8 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
   }
 }
 
-uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count);
+uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
+                              PBS_TYPE pbs_type);
 
 int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

@@ -73,9 +75,10 @@ public:
 
   // Returns a subset of this set as an active subset. An active subset is one
   // that is temporarily used to perform some computation
-  CudaStreams active_gpu_subset(int num_radix_blocks) {
-    return CudaStreams(_streams, _gpu_indexes,
-                       get_active_gpu_count(num_radix_blocks, _gpu_count));
+  CudaStreams active_gpu_subset(int num_radix_blocks, PBS_TYPE pbs_type) {
+    return CudaStreams(
+        _streams, _gpu_indexes,
+        get_active_gpu_count(num_radix_blocks, _gpu_count, pbs_type));
   }
 
   // Returns a CudaStreams struct containing only the ith stream
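With the extra parameter, the same stream set can resolve to subsets of
different sizes depending on the PBS flavor. A hedged usage sketch (the
8-GPU machine and block counts are made up; CLASSICAL is assumed to be the
name of the non-multi-bit PBS_TYPE enumerator):

  // 96 radix blocks, multi-bit parameters: ceil(96 / 12) = 8 GPUs active.
  auto subset_mb = streams.active_gpu_subset(96, PBS_TYPE::MULTI_BIT);
  // 96 radix blocks, classical parameters: ceil(96 / 68) = 2 GPUs active.
  auto subset_cl = streams.active_gpu_subset(96, PBS_TYPE::CLASSICAL);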
@@ -20,7 +20,8 @@ template <typename Torus> struct boolean_bitop_buffer {
     gpu_memory_allocated = allocate_gpu_memory;
     this->op = op;
     this->params = params;
-    auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
+    auto active_streams =
+        streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
     this->unchecked = is_unchecked;
     switch (op) {
     case BITAND:

@@ -119,7 +120,8 @@ template <typename Torus> struct int_bitop_buffer {
     gpu_memory_allocated = allocate_gpu_memory;
     this->op = op;
     this->params = params;
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     switch (op) {
     case BITAND:
     case BITOR:

@@ -216,7 +218,8 @@ template <typename Torus> struct boolean_bitnot_buffer {
         message_extract_lut->get_max_degree(0), params.glwe_dimension,
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         lut_f_message_extract, gpu_memory_allocated);
-    auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
+    auto active_streams =
+        streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
     message_extract_lut->broadcast_lut(active_streams);
   }
 }

@@ -39,7 +39,8 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
         },
         allocate_gpu_memory);
 
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     lut->broadcast_lut(active_streams);
 
     this->last_block = new CudaRadixCiphertextFFI;

@@ -14,7 +14,8 @@ template <typename Torus> struct int_zero_out_if_buffer {
                         uint64_t &size_tracker) {
     gpu_memory_allocated = allocate_gpu_memory;
     this->params = params;
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
 
     tmp = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(

@@ -114,9 +115,11 @@ template <typename Torus> struct int_cmux_buffer {
         predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
         2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
         streams.gpu_index(0), allocate_gpu_memory);
-    auto active_streams_pred = streams.active_gpu_subset(2 * num_radix_blocks);
+    auto active_streams_pred =
+        streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
     predicate_lut->broadcast_lut(active_streams_pred);
-    auto active_streams_msg = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams_msg =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     message_extract_lut->broadcast_lut(active_streams_msg);
   }

@@ -52,7 +52,8 @@ template <typename Torus> struct int_are_all_block_true_buffer {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, is_max_value_f, gpu_memory_allocated);
 
-    auto active_streams = streams.active_gpu_subset(max_chunks);
+    auto active_streams =
+        streams.active_gpu_subset(max_chunks, params.pbs_type);
     is_max_value->broadcast_lut(active_streams);
   }

@@ -108,7 +109,8 @@ template <typename Torus> struct int_comparison_eq_buffer {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
 
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     is_non_zero_lut->broadcast_lut(active_streams);
 
     // Scalar may have up to num_radix_blocks blocks

@@ -238,7 +240,8 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
         tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         block_selector_f, gpu_memory_allocated);
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     tree_inner_leaf_lut->broadcast_lut(active_streams);
   }

@@ -390,7 +393,8 @@ template <typename Torus> struct int_comparison_buffer {
     this->op = op;
     this->is_signed = is_signed;
 
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
 
     identity_lut_f = [](Torus x) -> Torus { return x; };

@@ -523,7 +527,7 @@ template <typename Torus> struct int_comparison_buffer {
         signed_lut->get_degree(0), signed_lut->get_max_degree(0),
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, signed_lut_f, gpu_memory_allocated);
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
     signed_lut->broadcast_lut(active_streams);
   }
   preallocated_h_lut = (Torus *)malloc(

@@ -116,7 +116,8 @@ template <typename Torus> struct int_decompression {
         effective_compression_carry_modulus,
         encryption_params.message_modulus, encryption_params.carry_modulus,
         decompression_rescale_f, gpu_memory_allocated);
-    auto active_streams = streams.active_gpu_subset(num_blocks_to_decompress);
+    auto active_streams = streams.active_gpu_subset(
+        num_blocks_to_decompress, decompression_rescale_lut->params.pbs_type);
     decompression_rescale_lut->broadcast_lut(active_streams);
   }
 }

@@ -356,7 +356,8 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
          params.glwe_dimension, params.polynomial_size, params.message_modulus,
          params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
-      auto active_streams = streams.active_gpu_subset(num_blocks);
+      auto active_streams =
+          streams.active_gpu_subset(num_blocks, params.pbs_type);
       luts[j]->broadcast_lut(active_streams);
     }
   }

@@ -1012,7 +1013,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
         masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         lut_f_masking, gpu_memory_allocated);
-    auto active_streams_1 = streams.active_gpu_subset(1);
+    auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
     masking_luts_1[i]->broadcast_lut(active_streams_1);
 
     generate_device_accumulator<Torus>(

@@ -1021,7 +1022,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
         masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         lut_f_masking, gpu_memory_allocated);
-    auto active_streams_2 = streams.active_gpu_subset(num_blocks);
+    auto active_streams_2 =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
     masking_luts_2[i]->broadcast_lut(active_streams_2);
   }

@@ -1040,7 +1042,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
 
     int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
                                      message_extract_lut_2};
-    auto active_streams = streams.active_gpu_subset(num_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
     for (int j = 0; j < 2; j++) {
       generate_device_accumulator<Torus>(
           streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),

@@ -1128,7 +1131,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
 
     // merge_overflow_flags_luts
     merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
-    auto active_gpu_count_for_bits = streams.active_gpu_subset(1);
+    auto active_gpu_count_for_bits =
+        streams.active_gpu_subset(1, params.pbs_type);
     for (int i = 0; i < num_bits_in_message; i++) {
       auto lut_f_bit = [i](Torus x, Torus y) -> Torus {
         return (x == 0 && y == 0) << i;

@@ -1152,7 +1156,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
                                uint32_t num_blocks, bool allocate_gpu_memory,
                                uint64_t &size_tracker) {
     gpu_memory_allocated = allocate_gpu_memory;
-    auto active_streams = streams.active_gpu_subset(2 * num_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(2 * num_blocks, params.pbs_type);
     this->params = params;
 
     if (params.message_modulus == 4 && params.carry_modulus == 4 &&

@@ -1473,7 +1478,8 @@ template <typename Torus> struct int_div_rem_memory {
                      bool allocate_gpu_memory, uint64_t &size_tracker) {
 
     gpu_memory_allocated = allocate_gpu_memory;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
     this->params = params;
     this->is_signed = is_signed;

@@ -1559,7 +1565,7 @@ template <typename Torus> struct int_div_rem_memory {
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         f_compare_extracted_signed_bits, gpu_memory_allocated);
     auto active_gpu_count_cmp =
-        streams.active_gpu_subset(1); // only 1 block needed
+        streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
     compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
   }
 }

@@ -20,7 +20,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
     this->allocate_gpu_memory = allocate_gpu_memory;
     this->direction = direction;
     this->bit_value = bit_value;
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     this->univ_lut_mem =
         new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                  allocate_gpu_memory, size_tracker);

@@ -246,7 +247,8 @@ template <typename Torus> struct int_ilog2_buffer {
         params.glwe_dimension, params.polynomial_size,
         params.message_modulus, params.carry_modulus,
         lut_message_lambda, allocate_gpu_memory);
-    auto active_streams = streams.active_gpu_subset(counter_num_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
     lut_message_not->broadcast_lut(active_streams);
 
     this->lut_carry_not =

@@ -371,7 +371,8 @@ struct int_radix_lut_custom_input_output {
     this->num_input_blocks = num_input_blocks;
     this->gpu_memory_allocated = allocate_gpu_memory;
 
-    this->active_streams = streams.active_gpu_subset(num_radix_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
   }
 
   void setup_degrees() {

@@ -382,14 +383,18 @@ struct int_radix_lut_custom_input_output {
 
   void allocate_pbs_buffers(int_radix_params params, uint32_t num_radix_blocks,
                             bool allocate_gpu_memory, uint64_t &size_tracker) {
 
+    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
+                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+
     for (uint i = 0; i < active_streams.count(); i++) {
       cuda_set_device(active_streams.gpu_index(i));
       int8_t *gpu_pbs_buffer;
-      auto num_blocks_on_gpu =
-          std::min((int)num_radix_blocks,
-                   std::max(THRESHOLD_MULTI_GPU,
-                            get_num_inputs_on_gpu(num_radix_blocks, i,
-                                                  active_streams.count())));
+      auto num_blocks_on_gpu = std::min(
+          (int)num_radix_blocks,
+          std::max(threshold, get_num_inputs_on_gpu(num_radix_blocks, i,
+                                                    active_streams.count())));
 
       uint64_t size = 0;
       execute_scratch_pbs<OutputTorus>(
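The clamp above sizes each per-GPU scratch buffer for at least threshold
blocks, but never for more than the whole input. A worked sketch with the
constants from this commit (block counts are made up): with multi-bit
parameters (threshold 12), 32 radix blocks on 3 active GPUs with shares of
11/11/10 give min(32, max(12, 11)) = 12 blocks per buffer; with classical
parameters (threshold 68) the subset collapses to a single GPU and the
buffer covers min(32, max(68, 32)) = 32 blocks, i.e. the full input.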
@@ -424,18 +429,22 @@ struct int_radix_lut_custom_input_output {
   /// back to the original indexing
   multi_gpu_alloc_lwe_async(active_streams, lwe_array_in_vec,
                             num_radix_blocks, params.big_lwe_dimension + 1,
-                            size_tracker, allocate_gpu_memory);
+                            size_tracker, params.pbs_type,
+                            allocate_gpu_memory);
   multi_gpu_alloc_lwe_async(active_streams, lwe_after_ks_vec,
                             num_radix_blocks, params.small_lwe_dimension + 1,
-                            size_tracker, allocate_gpu_memory);
+                            size_tracker, params.pbs_type,
+                            allocate_gpu_memory);
   if (num_many_lut > 1) {
     multi_gpu_alloc_lwe_many_lut_output_async(
         active_streams, lwe_after_pbs_vec, num_radix_blocks, num_many_lut,
-        params.big_lwe_dimension + 1, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension + 1, size_tracker, params.pbs_type,
+        allocate_gpu_memory);
   } else {
     multi_gpu_alloc_lwe_async(active_streams, lwe_after_pbs_vec,
                               num_radix_blocks, params.big_lwe_dimension + 1,
-                              size_tracker, allocate_gpu_memory);
+                              size_tracker, params.pbs_type,
+                              allocate_gpu_memory);
   }
   multi_gpu_alloc_array_async(active_streams, lwe_trivial_indexes_vec,
                               num_radix_blocks, size_tracker,

@@ -451,12 +460,14 @@ struct int_radix_lut_custom_input_output {
   }
 
   void setup_gemm_batch_ks_temp_buffers(uint64_t &size_tracker) {
+    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
+                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+
-    auto inputs_on_gpu =
-        std::min((int)num_input_blocks,
-                 std::max(THRESHOLD_MULTI_GPU,
-                          get_num_inputs_on_gpu(num_input_blocks, 0,
-                                                active_streams.count())));
+    auto inputs_on_gpu = std::min(
+        (int)num_input_blocks,
+        std::max(threshold, get_num_inputs_on_gpu(num_input_blocks, 0,
+                                                  active_streams.count())));
 
     if (inputs_on_gpu >= get_threshold_ks_gemm()) {
       for (auto i = 0; i < active_streams.count(); ++i) {

@@ -798,16 +809,20 @@ struct int_radix_lut_custom_input_output {
   void allocate_lwe_vector_for_non_trivial_indexes(
       CudaStreams streams, uint64_t max_num_radix_blocks,
       uint64_t &size_tracker, bool allocate_gpu_memory) {
 
+    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
+                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+
     // We need to create the auxiliary array only in GPU 0
     if (active_streams.count() > 1) {
       lwe_aligned_vec.resize(active_streams.count());
       for (uint i = 0; i < active_streams.count(); i++) {
         uint64_t size_tracker_on_array_i = 0;
-        auto inputs_on_gpu =
-            std::min((int)max_num_radix_blocks,
-                     std::max(THRESHOLD_MULTI_GPU,
-                              get_num_inputs_on_gpu(max_num_radix_blocks, i,
-                                                    active_streams.count())));
+        auto inputs_on_gpu = std::min(
+            (int)max_num_radix_blocks,
+            std::max(threshold, get_num_inputs_on_gpu(max_num_radix_blocks, i,
+                                                      active_streams.count())));
         InputTorus *d_array =
             (InputTorus *)cuda_malloc_with_size_tracking_async(
                 inputs_on_gpu * (params.big_lwe_dimension + 1) *

@@ -998,8 +1013,8 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
         num_radix_blocks * bits_per_block * sizeof(Torus), streams.stream(0),
         streams.gpu_index(0), allocate_gpu_memory);
 
-    auto active_streams =
-        streams.active_gpu_subset(bits_per_block * num_radix_blocks);
+    auto active_streams = streams.active_gpu_subset(
+        bits_per_block * num_radix_blocks, params.pbs_type);
     lut->broadcast_lut(active_streams);
 
     /**

@@ -1266,7 +1281,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
        luts_message_carry->get_max_degree(1), params.glwe_dimension,
        params.polynomial_size, message_modulus, params.carry_modulus,
        lut_f_carry, gpu_memory_allocated);
-    auto active_gpu_count_mc = streams.active_gpu_subset(pbs_count);
+    auto active_gpu_count_mc =
+        streams.active_gpu_subset(pbs_count, params.pbs_type);
     luts_message_carry->broadcast_lut(active_gpu_count_mc);
   }
 }

@@ -1436,7 +1452,8 @@ template <typename Torus> struct int_seq_group_prop_memory {
     cuda_memcpy_with_size_tracking_async_to_gpu(
         seq_lut_indexes, h_seq_lut_indexes, num_seq_luts * sizeof(Torus),
         streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
-    auto active_streams = streams.active_gpu_subset(num_seq_luts);
+    auto active_streams =
+        streams.active_gpu_subset(num_seq_luts, params.pbs_type);
     lut_sequential_algorithm->broadcast_lut(active_streams);
     free(h_seq_lut_indexes);
   };

@@ -1490,7 +1507,8 @@ template <typename Torus> struct int_hs_group_prop_memory {
        lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, f_lut_hillis_steele,
        gpu_memory_allocated);
-    auto active_streams = streams.active_gpu_subset(num_groups);
+    auto active_streams =
+        streams.active_gpu_subset(num_groups, params.pbs_type);
     lut_hillis_steele->broadcast_lut(active_streams);
   };
   void release(CudaStreams streams) {

@@ -1667,7 +1685,8 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
        lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
     // Do I need to do something else for the multi-gpu?
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     luts_array_first_step->broadcast_lut(active_streams);
   };
   void release(CudaStreams streams) {

@@ -1932,7 +1951,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
        scalar_array_cum_sum, h_scalar_array_cum_sum,
        num_radix_blocks * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     luts_array_second_step->broadcast_lut(active_streams);
 
     if (use_sequential_algorithm_to_resolve_group_carries) {

@@ -1957,7 +1977,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
     cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
         lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
         streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
-    auto new_active_streams = streams.active_gpu_subset(new_num_blocks);
+    auto new_active_streams = streams.active_gpu_subset(
+        new_num_blocks, luts_array_second_step->params.pbs_type);
     // We just need to update the lut indexes so we use false here
     luts_array_second_step->broadcast_lut(new_active_streams, false);

@@ -2124,7 +2145,7 @@ template <typename Torus> struct int_sc_prop_memory {
        polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
        gpu_memory_allocated);
 
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
     lut_overflow_flag_prep->broadcast_lut(active_streams);
   }

@@ -2196,7 +2217,8 @@ template <typename Torus> struct int_sc_prop_memory {
          (num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
          streams.gpu_index(0), allocate_gpu_memory);
     }
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks + 1);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
     lut_message_extract->broadcast_lut(active_streams);
   };

@@ -2393,7 +2415,8 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
        lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
     // Do I need to do something else for the multi-gpu?
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     luts_array_first_step->broadcast_lut(active_streams);
   };

@@ -2404,7 +2427,8 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
     cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
         lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
         streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
-    auto new_active_streams = streams.active_gpu_subset(new_num_blocks);
+    auto new_active_streams = streams.active_gpu_subset(
+        new_num_blocks, luts_array_first_step->params.pbs_type);
     // We just need to update the lut indexes so we use false here
     luts_array_first_step->broadcast_lut(new_active_streams, false);
   }

@@ -2499,7 +2523,8 @@ template <typename Torus> struct int_borrow_prop_memory {
        lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, f_message_extract,
        gpu_memory_allocated);
-    active_streams = streams.active_gpu_subset(num_radix_blocks);
+    active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
 
     lut_message_extract->broadcast_lut(active_streams);

@@ -2520,7 +2545,8 @@ template <typename Torus> struct int_borrow_prop_memory {
       lut_borrow_flag->broadcast_lut(active_streams);
     }
 
-    active_streams = streams.active_gpu_subset(num_radix_blocks);
+    active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     internal_streams.create_internal_cuda_streams_on_same_gpus(active_streams,
                                                                2);
   };

@@ -45,7 +45,8 @@ template <typename Torus> struct int_mul_memory {
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        zero_out_predicate_lut_f, gpu_memory_allocated);
 
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     zero_out_predicate_lut->broadcast_lut(active_streams);
 
     zero_out_mem = new int_zero_out_if_buffer<Torus>(

@@ -122,7 +123,8 @@ template <typename Torus> struct int_mul_memory {
        streams.stream(0), streams.gpu_index(0),
        luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
        msb_vector_block_count);
-    auto active_streams = streams.active_gpu_subset(total_block_count);
+    auto active_streams =
+        streams.active_gpu_subset(total_block_count, params.pbs_type);
     luts_array->broadcast_lut(active_streams);
     // create memory object for sum ciphertexts
     sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(

@@ -126,7 +126,8 @@ template <typename Torus> struct int_grouped_oprf_memory {
        luts->get_lut_indexes(0, 0), this->h_lut_indexes,
        num_blocks_to_process * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
-    auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
+    auto active_streams =
+        streams.active_gpu_subset(num_blocks_to_process, params.pbs_type);
     luts->broadcast_lut(active_streams);
 
     cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));

@@ -91,7 +91,8 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
        cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        shift_lut_f, gpu_memory_allocated);
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     cur_lut_bivariate->broadcast_lut(active_streams);
 
     lut_buffers_bivariate.push_back(cur_lut_bivariate);

@@ -177,7 +178,8 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
        cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        shift_lut_f, gpu_memory_allocated);
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     cur_lut_bivariate->broadcast_lut(active_streams);
 
     lut_buffers_bivariate.push_back(cur_lut_bivariate);

@@ -220,7 +222,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
                                    uint64_t &size_tracker) {
     gpu_memory_allocated = allocate_gpu_memory;
 
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
     // In the arithmetic shift, a PBS has to be applied to the last rotated
     // block twice: once to shift it, once to compute the padding block to be
     // copied onto all blocks to the left of the last rotated block

@@ -276,7 +278,8 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        shift_last_block_lut_univariate->get_max_degree(0),
        params.glwe_dimension, params.polynomial_size, params.message_modulus,
        params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
-    auto active_streams_shift_last = streams.active_gpu_subset(1);
+    auto active_streams_shift_last =
+        streams.active_gpu_subset(1, params.pbs_type);
     shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);
 
     lut_buffers_univariate.push_back(shift_last_block_lut_univariate);

@@ -302,7 +305,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        padding_block_lut_f, gpu_memory_allocated);
-    // auto active_streams = streams.active_gpu_subset(1);
+    // auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
     padding_block_lut_univariate->broadcast_lut(active_streams);
 
     lut_buffers_univariate.push_back(padding_block_lut_univariate);

@@ -344,7 +347,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        blocks_lut_f, gpu_memory_allocated);
     auto active_streams_shift_blocks =
-        streams.active_gpu_subset(num_radix_blocks);
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);
 
     lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);

@@ -119,8 +119,8 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
        mux_lut->get_degree(0), mux_lut->get_max_degree(0),
        params.glwe_dimension, params.polynomial_size, params.message_modulus,
        params.carry_modulus, mux_lut_f, gpu_memory_allocated);
-    auto active_gpu_count_mux =
-        streams.active_gpu_subset(bits_per_block * num_radix_blocks);
+    auto active_gpu_count_mux = streams.active_gpu_subset(
+        bits_per_block * num_radix_blocks, params.pbs_type);
     mux_lut->broadcast_lut(active_gpu_count_mux);
 
     auto cleaning_lut_f = [params](Torus x) -> Torus {

@@ -132,7 +132,7 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
        params.glwe_dimension, params.polynomial_size, params.message_modulus,
        params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
     auto active_gpu_count_cleaning =
-        streams.active_gpu_subset(num_radix_blocks);
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     cleaning_lut->broadcast_lut(active_gpu_count_cleaning);
   }

@@ -108,7 +108,8 @@ template <typename Torus> struct int_overflowing_sub_memory {
        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
        f_message_acc, gpu_memory_allocated);
 
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     luts_array->broadcast_lut(active_streams);
     luts_borrow_propagation_sum->broadcast_lut(active_streams);
     message_acc->broadcast_lut(active_streams);

@@ -38,7 +38,8 @@ template <typename Torus> struct int_unchecked_all_eq_slices_buffer {
       num_streams_to_use = 1;
 
     this->num_streams = num_streams_to_use;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
     uint32_t num_gpus = active_streams.count();

@@ -40,7 +40,8 @@ template <typename Torus> struct int_equality_selectors_buffer {
 
     this->num_streams = num_streams_to_use;
 
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
     this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
         active_streams, num_streams_to_use);

@@ -154,7 +155,8 @@ template <typename Torus> struct int_possible_results_buffer {
 
     this->num_streams = num_streams_to_use;
 
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
     this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
         active_streams, num_streams_to_use);

@@ -207,7 +209,8 @@ template <typename Torus> struct int_possible_results_buffer {
          params.message_modulus, params.carry_modulus, fns,
          allocate_gpu_memory);
 
-      current_lut->broadcast_lut(streams.active_gpu_subset(1));
+      current_lut->broadcast_lut(
+          streams.active_gpu_subset(1, params.pbs_type));
       stream_luts[lut_count++] = current_lut;
       lut_value_start += luts_in_this_call;
     }

@@ -282,7 +285,8 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
 
    this->num_streams = num_streams_to_use;
 
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
     this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
         active_streams, num_streams);

@@ -300,7 +304,8 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        id_fn, allocate_gpu_memory);
 
-    lut->broadcast_lut(streams.active_gpu_subset(num_blocks));
+    lut->broadcast_lut(
+        streams.active_gpu_subset(num_blocks, params.pbs_type));
     this->stream_identity_luts[i] = lut;
   }

@@ -321,7 +326,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        msg_fn, allocate_gpu_memory);
     this->message_extract_lut->broadcast_lut(
-        streams.active_gpu_subset(num_blocks));
+        streams.active_gpu_subset(num_blocks, params.pbs_type));
 
     this->carry_extract_lut = new int_radix_lut<Torus>(
         streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);

@@ -333,7 +338,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        carry_fn, allocate_gpu_memory);
     this->carry_extract_lut->broadcast_lut(
-        streams.active_gpu_subset(num_blocks));
+        streams.active_gpu_subset(num_blocks, params.pbs_type));
 
     this->partial_aggregated_vectors =
         new CudaRadixCiphertextFFI *[num_streams];

@@ -628,7 +633,8 @@ template <typename Torus> struct int_unchecked_contains_buffer {
      num_streams_to_use = 1;
 
    this->num_streams = num_streams_to_use;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);

@@ -703,7 +709,8 @@ template <typename Torus> struct int_unchecked_contains_clear_buffer {
      num_streams_to_use = 1;
 
    this->num_streams = num_streams_to_use;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);

@@ -1094,7 +1101,8 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
      num_streams_to_use = 1;
 
    this->num_streams = num_streams_to_use;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);

@@ -1184,7 +1192,8 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
        this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        prefix_sum_fn, allocate_gpu_memory);
-    this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
+    this->prefix_sum_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));
 
    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;

@@ -1200,7 +1209,8 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
        this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        cleanup_fn, allocate_gpu_memory);
-    this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
+    this->cleanup_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));
  }
 
  void release(CudaStreams streams) {

@@ -1292,7 +1302,8 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
      num_streams_to_use = 1;
 
    this->num_streams = num_streams_to_use;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);

@@ -1372,7 +1383,8 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
        this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        prefix_sum_fn, allocate_gpu_memory);
-    this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
+    this->prefix_sum_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));
 
    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;

@@ -1388,7 +1400,8 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
        this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        cleanup_fn, allocate_gpu_memory);
-    this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
+    this->cleanup_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));
  }
 
  void release(CudaStreams streams) {

@@ -1462,7 +1475,8 @@ template <typename Torus> struct int_unchecked_index_of_buffer {
      num_streams_to_use = 1;
 
    this->num_streams = num_streams_to_use;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);

@@ -1523,7 +1537,8 @@ template <typename Torus> struct int_unchecked_index_of_clear_buffer {
      num_streams_to_use = 1;
 
    this->num_streams = num_streams_to_use;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);

@@ -289,7 +289,8 @@ template <typename Torus> struct zk_expand_mem {
        lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
 
-    auto active_streams = streams.active_gpu_subset(2 * num_lwes);
+    auto active_streams =
+        streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
     message_and_carry_extract_luts->broadcast_lut(active_streams);
 
     message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(

@@ -153,7 +153,8 @@ __host__ void are_all_comparisons_block_true(
    cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
                             h_lut_indexes, num_chunks * sizeof(Torus),
                             streams.stream(0), streams.gpu_index(0));
-    auto active_streams = streams.active_gpu_subset(num_chunks);
+    auto active_streams =
+        streams.active_gpu_subset(num_chunks, params.pbs_type);
    is_max_value_lut->broadcast_lut(active_streams);
  }
  lut = is_max_value_lut;

@@ -172,8 +173,8 @@ __host__ void are_all_comparisons_block_true(
                             is_max_value_lut->h_lut_indexes,
                             is_max_value_lut->num_blocks * sizeof(Torus),
                             streams.stream(0), streams.gpu_index(0));
-    auto active_gpu_count_is_max =
-        streams.active_gpu_subset(is_max_value_lut->num_blocks);
+    auto active_gpu_count_is_max = streams.active_gpu_subset(
+        is_max_value_lut->num_blocks, params.pbs_type);
    is_max_value_lut->broadcast_lut(active_gpu_count_is_max, false);
 
    reset_radix_ciphertext_blocks(lwe_array_out, 1);

@@ -488,7 +489,7 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
      polynomial_size, message_modulus, carry_modulus, f, true,
      tree_buffer->preallocated_h_lut);
 
-  auto active_streams = streams.active_gpu_subset(1);
+  auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
  last_lut->broadcast_lut(active_streams);
 
  // Last leaf

@@ -339,7 +339,9 @@ host_integer_decompress(CudaStreams streams,
  /// dimension to a big LWE dimension
  auto encryption_params = h_mem_ptr->encryption_params;
  auto lut = h_mem_ptr->decompression_rescale_lut;
-  auto active_streams = streams.active_gpu_subset(num_blocks_to_decompress);
+  auto active_streams = streams.active_gpu_subset(
+      num_blocks_to_decompress,
+      h_mem_ptr->decompression_rescale_lut->params.pbs_type);
  if (active_streams.count() == 1) {
    execute_pbs_async<Torus, Torus>(
        active_streams, (Torus *)d_lwe_array_out->ptr, lut->lwe_indexes_out,

@@ -542,7 +542,8 @@ __host__ void integer_radix_apply_univariate_lookup_table(
  std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
 
-  auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+  auto active_streams =
+      streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<Torus>(
        streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
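The subset size computed here selects between two execution paths: with a
single active GPU, the keyswitch and PBS run directly on stream 0; with
more, the inputs are scattered across the subset and gathered afterwards.
A toy model of that decision, using the thresholds from this commit (the
function and its bool parameter are hypothetical):

  #include <algorithm>
  #include <cstdint>

  enum class Path { SingleGpu, MultiGpu };

  // Mirrors get_active_gpu_count: one GPU absorbs up to `threshold` inputs.
  Path lut_dispatch_path(uint32_t num_radix_blocks, uint32_t gpu_count,
                         bool multi_bit_pbs) {
    uint32_t threshold = multi_bit_pbs ? 12 : 68;
    uint32_t active = std::min(
        std::max<uint32_t>(1, (num_radix_blocks + threshold - 1) / threshold),
        gpu_count);
    return active == 1 ? Path::SingleGpu : Path::MultiGpu;
  }

Under classical parameters, radix ciphertexts of up to 68 blocks therefore
stay on the single-GPU fast path that previously ended at 12 blocks.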
@@ -645,7 +646,8 @@ __host__ void integer_radix_apply_many_univariate_lookup_table(
  std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
 
-  auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+  auto active_streams =
+      streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<Torus>(
        streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],

@@ -764,7 +766,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
  std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
 
-  auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+  auto active_streams =
+      streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<Torus>(
        streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],

@@ -1812,7 +1815,8 @@ uint64_t scratch_cuda_apply_univariate_lut(
      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
      streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
  *(*mem_ptr)->get_degree(0) = lut_degree;
-  auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+  auto active_streams =
+      streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
  (*mem_ptr)->broadcast_lut(active_streams);
  POP_RANGE()
  return size_tracker;

@@ -1847,7 +1851,8 @@ uint64_t scratch_cuda_apply_many_univariate_lut(
      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
      streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
  *(*mem_ptr)->get_degree(0) = lut_degree;
-  auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+  auto active_streams =
+      streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
  (*mem_ptr)->broadcast_lut(active_streams);
  POP_RANGE()
  return size_tracker;

@@ -1883,7 +1888,8 @@ uint64_t scratch_cuda_apply_bivariate_lut(
      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
      streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
  *(*mem_ptr)->get_degree(0) = lut_degree;
-  auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+  auto active_streams =
+      streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
  (*mem_ptr)->broadcast_lut(active_streams);
  POP_RANGE()
  return size_tracker;

@@ -2336,8 +2342,8 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
 
  // Since the radix ciphertexts are packed, we have to use the num_radix_blocks
  // from the output ct
-  auto active_streams =
-      streams.active_gpu_subset(lwe_array_out->num_radix_blocks);
+  auto active_streams = streams.active_gpu_subset(
+      lwe_array_out->num_radix_blocks, params.pbs_type);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<InputTorus>(
        streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],

@@ -388,7 +388,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
    current_columns.next_accumulation(total_ciphertexts, total_messages,
                                      needs_processing);
 
-    auto active_streams = streams.active_gpu_subset(total_ciphertexts);
+    auto active_streams =
+        streams.active_gpu_subset(total_ciphertexts, mem_ptr->params.pbs_type);
    GPU_ASSERT(total_ciphertexts <= mem_ptr->luts_message_carry->num_blocks,
               "SUM CT");

@@ -442,7 +443,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
        streams.stream(0), streams.gpu_index(0), current_blocks,
        num_radix_blocks, num_radix_blocks + 1);
 
-    auto active_streams = streams.active_gpu_subset(2 * num_radix_blocks);
+    auto active_streams = streams.active_gpu_subset(2 * num_radix_blocks,
+                                                    mem_ptr->params.pbs_type);
 
    if (active_streams.count() == 1) {
      execute_keyswitch_async<Torus>(

@@ -29,7 +29,8 @@ void host_integer_grouped_oprf(CudaStreams streams,
                               int_grouped_oprf_memory<Torus> *mem_ptr,
                               void *const *bsks) {
 
-  auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
+  auto active_streams = streams.active_gpu_subset(num_blocks_to_process,
+                                                  mem_ptr->params.pbs_type);
  auto lut = mem_ptr->luts;
 
  if (active_streams.count() == 1) {

@@ -45,7 +45,8 @@ host_scalar_bitop(CudaStreams streams, CudaRadixCiphertextFFI *output,
    cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
                                 num_clear_blocks * sizeof(Torus),
                                 streams.stream(0), streams.gpu_index(0));
-    auto active_streams = streams.active_gpu_subset(num_clear_blocks);
+    auto active_streams = streams.active_gpu_subset(
+        num_clear_blocks, mem_ptr->lut->params.pbs_type);
    lut->broadcast_lut(active_streams, false);
 
    integer_radix_apply_univariate_lookup_table<Torus>(

@@ -146,7 +146,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
        lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
        polynomial_size, message_modulus, carry_modulus, scalar_last_leaf_lut_f,
        true, mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
    lut->broadcast_lut(active_streams);
 
    integer_radix_apply_univariate_lookup_table<Torus>(

@@ -240,7 +240,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
        polynomial_size, message_modulus, carry_modulus,
        scalar_bivariate_last_leaf_lut_f, true,
        mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
    lut->broadcast_lut(active_streams);
 
    integer_radix_apply_bivariate_lookup_table<Torus>(

@@ -274,7 +274,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
        params.glwe_dimension, params.polynomial_size, params.message_modulus,
        params.carry_modulus, one_block_lut_f, true,
        mem_ptr->preallocated_h_lut);
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
    one_block_lut->broadcast_lut(active_streams);
 
    integer_radix_apply_univariate_lookup_table<Torus>(

@@ -419,7 +419,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
        polynomial_size, message_modulus, carry_modulus,
        scalar_bivariate_last_leaf_lut_f, true,
        mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
    lut->broadcast_lut(active_streams);
 
    integer_radix_apply_bivariate_lookup_table<Torus>(

@@ -521,7 +521,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
        signed_msb_lut->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        lut_f, true, mem_ptr->preallocated_h_lut);
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
    signed_msb_lut->broadcast_lut(active_streams);
 
    CudaRadixCiphertextFFI sign_block;

@@ -567,7 +567,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
        params.glwe_dimension, params.polynomial_size, params.message_modulus,
        params.carry_modulus, one_block_lut_f, true,
        mem_ptr->preallocated_h_lut);
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
    one_block_lut->broadcast_lut(active_streams);
 
    integer_radix_apply_univariate_lookup_table<Torus>(

@@ -785,8 +785,8 @@ __host__ void host_scalar_equality_check(
          num_halved_scalar_blocks * sizeof(Torus), lsb_streams.stream(0),
          lsb_streams.gpu_index(0));
    }
-    auto active_streams =
-        lsb_streams.active_gpu_subset(num_halved_scalar_blocks);
+    auto active_streams = lsb_streams.active_gpu_subset(
+        num_halved_scalar_blocks, params.pbs_type);
    // We use false cause we only will broadcast the indexes
    scalar_comparison_luts->broadcast_lut(active_streams, false);

@@ -5,7 +5,8 @@
 
 std::mutex m;
 bool p2p_enabled = false;
-const int THRESHOLD_MULTI_GPU = 12;
+const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS = 12;
+const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS = 68;
 
 // Enable bidirectional p2p access between all available GPUs and device_0_id
 int32_t cuda_setup_multi_gpu(int device_0_id) {

@@ -39,10 +40,13 @@ int32_t cuda_setup_multi_gpu(int device_0_id) {
  return (int32_t)(num_used_gpus);
 }
 
-uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count) {
+uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
+                              PBS_TYPE pbs_type) {
+  int threshold = (pbs_type == MULTI_BIT)
+                      ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+                      : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
  uint32_t ceil_div_inputs =
-      std::max((uint32_t)1,
-               (num_inputs + THRESHOLD_MULTI_GPU - 1) / THRESHOLD_MULTI_GPU);
+      std::max((uint32_t)1, (num_inputs + threshold - 1) / threshold);
  uint32_t active_gpu_count = std::min(ceil_div_inputs, gpu_count);
  return active_gpu_count;
 }
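Concretely, on an 8-GPU machine a 64-block input resolves to
ceil(64 / 12) = 6 active GPUs with multi-bit parameters but
ceil(64 / 68) = 1 active GPU with classical parameters, presumably because
splitting small classical-PBS batches across devices was not paying for the
transfer overhead. A hedged usage sketch (GPU and block counts are made up,
and CLASSICAL is assumed to be the non-multi-bit enumerator):

  uint32_t gpus = 8;
  uint32_t a = get_active_gpu_count(64, gpus, PBS_TYPE::CLASSICAL);  // 1
  uint32_t b = get_active_gpu_count(512, gpus, PBS_TYPE::CLASSICAL); // 8
  uint32_t c = get_active_gpu_count(64, gpus, PBS_TYPE::MULTI_BIT);  // 6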
@@ -59,15 +59,20 @@ template <typename Torus>
 void multi_gpu_alloc_lwe_async(CudaStreams streams, std::vector<Torus *> &dest,
                                uint32_t num_inputs, uint32_t lwe_size,
                                uint64_t &size_tracker_on_gpu_0,
-                               bool allocate_gpu_memory) {
+                               PBS_TYPE pbs_type, bool allocate_gpu_memory) {
  PANIC_IF_FALSE(dest.empty(),
                 "Cuda error: Requested multi-GPU vector is already allocated");
 
+  int threshold = (pbs_type == MULTI_BIT)
+                      ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+                      : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+
  dest.resize(streams.count());
  for (uint i = 0; i < streams.count(); i++) {
    uint64_t size_tracker_on_gpu_i = 0;
    auto inputs_on_gpu = std::min(
        (int)num_inputs,
-        std::max(THRESHOLD_MULTI_GPU,
+        std::max((int)threshold,
                 get_num_inputs_on_gpu(num_inputs, i, streams.count())));
    Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
        inputs_on_gpu * lwe_size * sizeof(Torus), streams.stream(i),

@@ -81,7 +86,7 @@ void multi_gpu_alloc_lwe_async(CudaStreams streams, std::vector<Torus *> &dest,
 
 template void multi_gpu_alloc_lwe_async<__uint128_t>(
    CudaStreams streams, std::vector<__uint128_t *> &dest, uint32_t num_inputs,
-    uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
+    uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0, PBS_TYPE pbs_type,
    bool allocate_gpu_memory);
 
 /// Allocates the input/output vector for all devices

@@ -91,16 +96,21 @@ template <typename Torus>
 void multi_gpu_alloc_lwe_many_lut_output_async(
    CudaStreams streams, std::vector<Torus *> &dest, uint32_t num_inputs,
    uint32_t num_many_lut, uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
-    bool allocate_gpu_memory) {
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
 
  PANIC_IF_FALSE(dest.empty(),
                 "Cuda error: Requested multi-GPU vector is already allocated");
 
+  int threshold = (pbs_type == MULTI_BIT)
+                      ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+                      : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+
  dest.resize(streams.count());
  for (uint i = 0; i < streams.count(); i++) {
    uint64_t size_tracker = 0;
    auto inputs_on_gpu = std::min(
        (int)num_inputs,
-        std::max(THRESHOLD_MULTI_GPU,
+        std::max((int)threshold,
                 get_num_inputs_on_gpu(num_inputs, i, streams.count())));
    Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
        num_many_lut * inputs_on_gpu * lwe_size * sizeof(Torus),
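One consequence of clamping with the threshold rather than with the exact
per-GPU share is that every active GPU reserves room for at least threshold
LWEs even when its share is smaller. A worked sketch, assuming classical
parameters and a big-LWE size of 1025 words of 8 bytes (both numbers are
illustrative): 128 inputs resolve to ceil(128 / 68) = 2 active GPUs with 64
inputs apiece, yet each allocates min(128, max(68, 64)) = 68 LWEs, i.e.
68 * 1025 * 8 = 557,600 bytes.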