Mirror of https://github.com/zama-ai/tfhe-rs.git (synced 2026-01-09 14:47:56 -05:00)
fix(gpu): revert "change broadcast lut to communicate the minimum possible"
This reverts commit baad6a6b49.
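For context, `broadcast_lut` copies the LUT (and its index table) generated on GPU 0 to the other active GPUs through a CUDA event plus device-to-device copies. The snippet below is a minimal sketch of that pattern as a member of `int_radix_lut`, assembled from the helpers and fields visible in this diff (`cuda_event_record`, `cuda_stream_wait_event`, `cuda_memcpy_with_size_tracking_async_gpu_to_gpu`, `lut_vec`, `lut_indexes_vec`); it is an illustration of the broadcast mechanism, not the exact code touched by the revert.

```cpp
// Sketch only: relies on int_radix_lut members shown in the diff below
// (active_gpu_count, params, lut_vec, lut_indexes_vec, num_luts, num_blocks,
//  event_broadcast, gpu_memory_allocated).
void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes) {
  // Nothing to do on a single GPU.
  if (active_gpu_count <= 1)
    return;

  uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
  auto src_lut = lut_vec[0];
  auto src_lut_indexes = lut_indexes_vec[0];

  // GPU 0 records an event once its LUT is ready ...
  cuda_event_record(event_broadcast, streams[0], gpu_indexes[0]);
  for (uint i = 0; i < active_gpu_count; i++) {
    if (gpu_indexes[i] == gpu_indexes[0])
      continue;
    // ... the other GPUs wait on it, then pull the LUT and its indexes
    // device-to-device on their own streams.
    cuda_stream_wait_event(streams[i], event_broadcast, gpu_indexes[i]);
    cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
        lut_vec[i], src_lut, num_luts * lut_size * sizeof(Torus), streams[i],
        gpu_indexes[i], gpu_memory_allocated);
    cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
        lut_indexes_vec[i], src_lut_indexes, num_blocks * sizeof(Torus),
        streams[i], gpu_indexes[i], gpu_memory_allocated);
  }
}
```

Both signatures appear in the diff: the form taking an explicit `new_active_gpu_count` (and an optional `broadcast_lut_values` flag) that the reverted commit had introduced to limit the copy to the minimum number of GPUs, and the parameterless form restored by this revert.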
@@ -115,10 +115,8 @@ template <typename Torus> struct int_decompression {
effective_compression_carry_modulus,
encryption_params.message_modulus, encryption_params.carry_modulus,
decompression_rescale_f, gpu_memory_allocated);
auto active_gpu_count =
get_active_gpu_count(num_blocks_to_decompress, gpu_count);
decompression_rescale_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count);

decompression_rescale_lut->broadcast_lut(streams, gpu_indexes);
}
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,

@@ -320,13 +320,12 @@ template <typename Torus> struct int_radix_lut {
std::vector<Torus *> lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec;
std::vector<Torus *> lwe_aligned_vec;

uint32_t *gpu_indexes;
bool gpu_memory_allocated;

cudaEvent_t event_scatter_in;
cudaEvent_t *event_scatter_out;
cudaEvent_t event_scatter_out[8];
cudaEvent_t event_broadcast;

int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,

@@ -364,19 +363,15 @@ template <typename Torus> struct int_radix_lut {
if (i == 0) {
size_tracker += size;
}
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
}

// We create the events only if we have multiple GPUs
if (active_gpu_count > 1) {
event_scatter_in = cuda_create_event(gpu_indexes[0]);
event_broadcast = cuda_create_event(gpu_indexes[0]);
event_scatter_in = cuda_create_event(gpu_indexes[0]);
event_broadcast = cuda_create_event(gpu_indexes[0]);

event_scatter_out =
(cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t));
for (int i = 0; i < active_gpu_count; i++) {
event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
}
for (int i = 0; i < active_gpu_count; i++) {
event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
}

// Allocate LUT

@@ -397,6 +392,8 @@ template <typename Torus> struct int_radix_lut {

lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);

cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}

// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)

@@ -515,6 +512,8 @@ template <typename Torus> struct int_radix_lut {

lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);

cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}

// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)

@@ -571,6 +570,7 @@ template <typename Torus> struct int_radix_lut {

///////////////
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_set_device(gpu_indexes[i]);
int8_t *gpu_pbs_buffer;

@@ -587,18 +587,14 @@ template <typename Torus> struct int_radix_lut {
if (i == 0) {
size_tracker += size;
}
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
buffer.push_back(gpu_pbs_buffer);
}
// We create the events only if we have multiple GPUs
if (active_gpu_count > 1) {
event_scatter_in = cuda_create_event(gpu_indexes[0]);
event_broadcast = cuda_create_event(gpu_indexes[0]);
event_scatter_in = cuda_create_event(gpu_indexes[0]);
event_broadcast = cuda_create_event(gpu_indexes[0]);

event_scatter_out =
(cudaEvent_t *)malloc(active_gpu_count * sizeof(cudaEvent_t));
for (int i = 0; i < active_gpu_count; i++) {
event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
}
for (int i = 0; i < active_gpu_count; i++) {
event_scatter_out[i] = cuda_create_event(gpu_indexes[i]);
}
// Allocate LUT
// LUT is used as a trivial encryption and must be initialized outside

@@ -618,6 +614,8 @@ template <typename Torus> struct int_radix_lut {

lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);

cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}

// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)

@@ -668,9 +666,11 @@ template <typename Torus> struct int_radix_lut {
multi_gpu_alloc_array_async(streams, gpu_indexes, active_gpu_count,
lwe_trivial_indexes_vec, num_radix_blocks,
size_tracker, allocate_gpu_memory);
multi_gpu_copy_array_from_cpu_async(
streams, gpu_indexes, active_gpu_count, lwe_trivial_indexes_vec,
h_lwe_indexes_in, num_radix_blocks, allocate_gpu_memory);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count,
lwe_trivial_indexes_vec, lwe_trivial_indexes,
num_radix_blocks, allocate_gpu_memory);

// Keyswitch
tmp_lwe_before_ks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(

@@ -727,98 +727,46 @@ template <typename Torus> struct int_radix_lut {

// Broadcast luts from device gpu_indexes[0] to all active gpus
void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes) {
// We only do broadcast if there are more than 1 active GPU
if (active_gpu_count > 1) {
int active_device = cuda_get_device();
int active_device = cuda_get_device();

uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;

auto src_lut = lut_vec[0];
auto src_lut_indexes = lut_indexes_vec[0];
auto src_lut = lut_vec[0];
auto src_lut_indexes = lut_indexes_vec[0];

cuda_event_record(event_broadcast, streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
if (gpu_indexes[i] != gpu_indexes[0]) {
cuda_stream_wait_event(streams[i], event_broadcast, gpu_indexes[i]);
auto dst_lut = lut_vec[i];
auto dst_lut_indexes = lut_indexes_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut, src_lut, num_luts * lut_size * sizeof(Torus), streams[i],
gpu_indexes[i], gpu_memory_allocated);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
streams[i], gpu_indexes[i], gpu_memory_allocated);
}
}
// Ensure the device set at the end of this method is the same as it was
// set at the beginning
cuda_set_device(active_device);
}
}
// Broadcast luts from device gpu_indexes[0] to all active gpus
void broadcast_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t new_active_gpu_count,
bool broadcast_lut_values = true) {
// We only do broadcast if there are more than 1 active GPU
if (new_active_gpu_count > 1) {
int active_device = cuda_get_device();

uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;

auto src_lut = lut_vec[0];
auto src_lut_indexes = lut_indexes_vec[0];
if (active_gpu_count > 1)
cuda_event_record(event_broadcast, streams[0], gpu_indexes[0]);
for (uint i = 0; i < new_active_gpu_count; i++) {
if (gpu_indexes[i] != gpu_indexes[0]) {
cuda_stream_wait_event(streams[i], event_broadcast, gpu_indexes[i]);
if (broadcast_lut_values) {
auto dst_lut = lut_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut, src_lut, num_luts * lut_size * sizeof(Torus),
streams[i], gpu_indexes[i], gpu_memory_allocated);
}
auto dst_lut_indexes = lut_indexes_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
streams[i], gpu_indexes[i], gpu_memory_allocated);
}
}
// Ensure the device set at the end of this method is the same as it was
// set at the beginning
cuda_set_device(active_device);
}
}

void allocate_lwe_vector_for_non_trivial_indexes(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t active_gpu_count, uint64_t max_num_radix_blocks,
uint64_t &size_tracker, bool allocate_gpu_memory) {
// We need to create the auxiliary array only in GPU 0
lwe_aligned_vec.resize(active_gpu_count);
cuda_event_record(event_broadcast, streams[0], gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
uint64_t size_tracker_on_array_i = 0;
auto inputs_on_gpu = std::max(
THRESHOLD_MULTI_GPU,
get_num_inputs_on_gpu(max_num_radix_blocks, i, active_gpu_count));
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
inputs_on_gpu * (params.big_lwe_dimension + 1) * sizeof(Torus),
streams[0], gpu_indexes[0], size_tracker_on_array_i,
allocate_gpu_memory);
lwe_aligned_vec[i] = d_array;
size_tracker += size_tracker_on_array_i;
if (gpu_indexes[i] != gpu_indexes[0]) {
cuda_stream_wait_event(streams[i], event_broadcast, gpu_indexes[i]);
auto dst_lut = lut_vec[i];
auto dst_lut_indexes = lut_indexes_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut, src_lut, num_luts * lut_size * sizeof(Torus), streams[i],
gpu_indexes[i], gpu_memory_allocated);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(Torus),
streams[i], gpu_indexes[i], gpu_memory_allocated);
}
}
// Ensure the device set at the end of this method is the same as it was set
// at the beginning
cuda_set_device(active_device);
}

void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
PUSH_RANGE("Release gpu mem lut")
free(this->gpu_indexes);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_drop_with_size_tracking_async(lut_vec[i], streams[i], gpu_indexes[i],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(lut_indexes_vec[i], streams[i],
gpu_indexes[i], gpu_memory_allocated);
}
for (uint i = 0; i < active_gpu_count; i++) {
cuda_event_destroy(event_scatter_out[i], gpu_indexes[i]);
}
cuda_event_destroy(event_scatter_in, gpu_indexes[0]);
cuda_event_destroy(event_broadcast, gpu_indexes[0]);
cuda_drop_with_size_tracking_async(lwe_indexes_in, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(lwe_indexes_out, streams[0],

@@ -826,6 +774,12 @@ template <typename Torus> struct int_radix_lut {
cuda_drop_with_size_tracking_async(lwe_trivial_indexes, streams[0],
gpu_indexes[0], gpu_memory_allocated);

cuda_synchronize_stream(streams[0], gpu_indexes[0]);
lut_vec.clear();
lut_indexes_vec.clear();
free(h_lwe_indexes_in);
free(h_lwe_indexes_out);

if (!mem_reuse) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
tmp_lwe_before_ks, gpu_memory_allocated);

@@ -842,59 +796,27 @@ template <typename Torus> struct int_radix_lut {
default:
PANIC("Cuda error (PBS): unknown PBS type. ")
}
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
delete tmp_lwe_before_ks;
buffer.clear();

if (gpu_memory_allocated) {
multi_gpu_release_async(streams, gpu_indexes, lwe_array_in_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_ks_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_pbs_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_trivial_indexes_vec);
for (uint i = 0; i < active_gpu_count; i++)
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
if (active_gpu_count > 1) {
for (uint i = 0; i < active_gpu_count; i++) {
cuda_event_destroy(event_scatter_out[i], gpu_indexes[i]);
}
cuda_event_destroy(event_scatter_in, gpu_indexes[0]);
cuda_event_destroy(event_broadcast, gpu_indexes[0]);
}
}
if (lwe_aligned_vec.size() > 0) {
for (uint i = 0; i < active_gpu_count; i++) {
cuda_drop_with_size_tracking_async(lwe_aligned_vec[i], streams[0],
gpu_indexes[0],
gpu_memory_allocated);
}
}
POP_RANGE()
PUSH_RANGE("Free cpu mem lut")
if (!mem_reuse) {
for (uint i = 0; i < active_gpu_count; i++)
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
delete tmp_lwe_before_ks;
buffer.clear();

lwe_array_in_vec.clear();
lwe_after_ks_vec.clear();
lwe_after_pbs_vec.clear();
lwe_trivial_indexes_vec.clear();

if (active_gpu_count > 1)
free(event_scatter_out);
}
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

if (lwe_aligned_vec.size() > 0) {
lwe_aligned_vec.clear();
}
lut_vec.clear();
lut_indexes_vec.clear();
free(h_lwe_indexes_in);
free(h_lwe_indexes_out);

free(h_lut_indexes);
free(degrees);
free(max_degrees);
free(this->gpu_indexes);
POP_RANGE()
}
};

@@ -939,8 +861,6 @@ template <typename InputTorus> struct int_noise_squashing_lut {

bool using_trivial_lwe_indexes = true;
bool gpu_memory_allocated;
std::vector<InputTorus *> lwe_aligned_scatter_vec;
std::vector<__uint128_t *> lwe_aligned_gather_vec;
// noise squashing constructor
int_noise_squashing_lut(cudaStream_t const *streams,
uint32_t const *input_gpu_indexes, uint32_t gpu_count,

@@ -1082,10 +1002,7 @@ template <typename InputTorus> struct int_noise_squashing_lut {
&pbs_buffer[i]);
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
if (lwe_aligned_gather_vec.size() > 0) {
multi_gpu_release_async(streams, gpu_indexes, lwe_aligned_gather_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_aligned_scatter_vec);
}

multi_gpu_release_async(streams, gpu_indexes, lwe_array_in_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_ks_vec);
multi_gpu_release_async(streams, gpu_indexes, lwe_after_pbs_vec);

@@ -1148,10 +1065,7 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
lut->get_lut_indexes(0, 0), h_lut_indexes,
num_radix_blocks * bits_per_block * sizeof(Torus), streams[0],
gpu_indexes[0], allocate_gpu_memory);

auto active_gpu_count =
get_active_gpu_count(bits_per_block * num_radix_blocks, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
lut->broadcast_lut(streams, gpu_indexes);

/**
* the input indexes should take the first bits_per_block PBS to target

@@ -1177,9 +1091,6 @@ template <typename Torus> struct int_bit_extract_luts_buffer {

lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in,
h_lwe_indexes_out);
lut->allocate_lwe_vector_for_non_trivial_indexes(
streams, gpu_indexes, active_gpu_count,
num_radix_blocks * bits_per_block, size_tracker, allocate_gpu_memory);

cuda_synchronize_stream(streams[0], gpu_indexes[0]);
free(h_lwe_indexes_in);

@@ -1321,9 +1232,7 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
mux_lut->get_degree(0), mux_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, mux_lut_f, gpu_memory_allocated);
auto active_gpu_count_mux =
get_active_gpu_count(bits_per_block * num_radix_blocks, gpu_count);
mux_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count_mux);
mux_lut->broadcast_lut(streams, gpu_indexes);

auto cleaning_lut_f = [params](Torus x) -> Torus {
return x % params.message_modulus;

@@ -1333,10 +1242,7 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
auto active_gpu_count_cleaning =
get_active_gpu_count(num_radix_blocks, gpu_count);
cleaning_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count_cleaning);
cleaning_lut->broadcast_lut(streams, gpu_indexes);
}

void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,

@@ -1425,8 +1331,8 @@ template <typename Torus> struct int_fullprop_buffer {
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_indexes, h_lwe_indexes, lwe_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
auto active_gpu_count = get_active_gpu_count(2, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);

lut->broadcast_lut(streams, gpu_indexes);

tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(

@@ -1561,11 +1467,9 @@ template <typename Torus> struct int_overflowing_sub_memory {
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_message_acc, gpu_memory_allocated);

auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
luts_array->broadcast_lut(streams, gpu_indexes, active_gpu_count);
luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
message_acc->broadcast_lut(streams, gpu_indexes, active_gpu_count);
luts_array->broadcast_lut(streams, gpu_indexes);
luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes);
message_acc->broadcast_lut(streams, gpu_indexes);
}

void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,

@@ -1674,8 +1578,9 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
uint32_t total_messages = 0;
current_columns.next_accumulation(total_ciphertexts, total_messages,
_needs_processing);
uint32_t pbs_count = std::max(total_ciphertexts, 2 * num_blocks_in_radix);

if (!mem_reuse) {
uint32_t pbs_count = std::max(total_ciphertexts, 2 * num_blocks_in_radix);
if (total_ciphertexts > 0 ||
reduce_degrees_for_single_carry_propagation) {
uint64_t size_tracker = 0;

@@ -1683,11 +1588,6 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
pbs_count, true, size_tracker);
allocated_luts_message_carry = true;
auto active_gpu_count =
get_active_gpu_count(this->max_total_blocks_in_vec, gpu_count);
luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
streams, gpu_indexes, gpu_count, this->max_total_blocks_in_vec,
size_tracker, true);
}
}
if (allocated_luts_message_carry) {

@@ -1715,9 +1615,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
auto active_gpu_count_mc = get_active_gpu_count(pbs_count, gpu_count);
luts_message_carry->broadcast_lut(streams, gpu_indexes,
active_gpu_count_mc);
luts_message_carry->broadcast_lut(streams, gpu_indexes);
}
}
int_sum_ciphertexts_vec_memory(

@@ -1785,9 +1683,7 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
this->current_blocks = current_blocks;
this->small_lwe_vector = small_lwe_vector;
this->luts_message_carry = reused_lut;
this->luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
streams, gpu_indexes, gpu_count, this->max_total_blocks_in_vec,
size_tracker, allocate_gpu_memory);

setup_index_buffers(streams, gpu_indexes, size_tracker);
}

@@ -1871,9 +1767,8 @@ template <typename Torus> struct int_seq_group_prop_memory {
cuda_memcpy_with_size_tracking_async_to_gpu(
seq_lut_indexes, h_seq_lut_indexes, num_seq_luts * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
auto active_gpu_count = get_active_gpu_count(num_seq_luts, gpu_count);
lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes,
active_gpu_count);

lut_sequential_algorithm->broadcast_lut(streams, gpu_indexes);
free(h_seq_lut_indexes);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,

@@ -1928,8 +1823,8 @@ template <typename Torus> struct int_hs_group_prop_memory {
lut_hillis_steele->get_degree(0), lut_hillis_steele->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_lut_hillis_steele, gpu_memory_allocated);
auto active_gpu_count = get_active_gpu_count(num_groups, gpu_count);
lut_hillis_steele->broadcast_lut(streams, gpu_indexes, active_gpu_count);

lut_hillis_steele->broadcast_lut(streams, gpu_indexes);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {

@@ -2105,9 +2000,8 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
luts_array_first_step->broadcast_lut(streams, gpu_indexes,
active_gpu_count);

luts_array_first_step->broadcast_lut(streams, gpu_indexes);
};
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {

@@ -2368,9 +2262,7 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
scalar_array_cum_sum, h_scalar_array_cum_sum,
num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
luts_array_second_step->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
luts_array_second_step->broadcast_lut(streams, gpu_indexes);

if (use_sequential_algorithm_to_resolve_group_carries) {

@@ -2389,17 +2281,14 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {

// needed for the division to update the lut indexes
void update_lut_indexes(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *new_lut_indexes, Torus *new_scalars,
uint32_t new_num_blocks) {
uint32_t const *gpu_indexes, Torus *new_lut_indexes,
Torus *new_scalars, uint32_t new_num_blocks) {
Torus *lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0], gpu_memory_allocated);
auto new_active_gpu_count = get_active_gpu_count(new_num_blocks, gpu_count);
// We just need to update the lut indexes so we use false here
luts_array_second_step->broadcast_lut(streams, gpu_indexes,
new_active_gpu_count, false);

luts_array_second_step->broadcast_lut(streams, gpu_indexes);

cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
scalar_array_cum_sum, new_scalars, new_num_blocks * sizeof(Torus),

@@ -2564,9 +2453,7 @@ template <typename Torus> struct int_sc_prop_memory {
polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
gpu_memory_allocated);

auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes);
}

// For the final cleanup in case of overflow or carry (it seems that I can)

@@ -2635,9 +2522,7 @@ template <typename Torus> struct int_sc_prop_memory {
(num_radix_blocks + 1) * sizeof(Torus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
}
auto active_gpu_count =
get_active_gpu_count(num_radix_blocks + 1, gpu_count);
lut_message_extract->broadcast_lut(streams, gpu_indexes, active_gpu_count);
lut_message_extract->broadcast_lut(streams, gpu_indexes);
};

void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,

@@ -2832,23 +2717,19 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
lut_indexes, h_lut_indexes, lut_indexes_size, streams[0],
gpu_indexes[0], allocate_gpu_memory);
// Do I need to do something else for the multi-gpu?
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
luts_array_first_step->broadcast_lut(streams, gpu_indexes,
active_gpu_count);

luts_array_first_step->broadcast_lut(streams, gpu_indexes);
};

// needed for the division to update the lut indexes
void update_lut_indexes(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *new_lut_indexes, uint32_t new_num_blocks) {
uint32_t const *gpu_indexes, Torus *new_lut_indexes,
uint32_t new_num_blocks) {
Torus *lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0], gpu_memory_allocated);
auto new_active_gpu_count = get_active_gpu_count(new_num_blocks, gpu_count);
// We just need to update the lut indexes so we use false here
luts_array_first_step->broadcast_lut(streams, gpu_indexes,
new_active_gpu_count, false);
luts_array_first_step->broadcast_lut(streams, gpu_indexes);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {

@@ -2947,9 +2828,8 @@ template <typename Torus> struct int_borrow_prop_memory {
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_message_extract,
gpu_memory_allocated);
active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);

lut_message_extract->broadcast_lut(streams, gpu_indexes, active_gpu_count);
lut_message_extract->broadcast_lut(streams, gpu_indexes);

if (compute_overflow) {
lut_borrow_flag = new int_radix_lut<Torus>(

@@ -2965,7 +2845,8 @@ template <typename Torus> struct int_borrow_prop_memory {
lut_borrow_flag->get_degree(0), lut_borrow_flag->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
f_borrow_flag, gpu_memory_allocated);
lut_borrow_flag->broadcast_lut(streams, gpu_indexes, active_gpu_count);

lut_borrow_flag->broadcast_lut(streams, gpu_indexes);
}

active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);

@@ -2993,15 +2874,15 @@ template <typename Torus> struct int_borrow_prop_memory {

// needed for the division to update the lut indexes
void update_lut_indexes(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
uint32_t const *gpu_indexes,
Torus *first_indexes_for_div,
Torus *second_indexes_for_div, Torus *scalars_for_div,
uint32_t new_num_blocks) {
shifted_blocks_borrow_state_mem->update_lut_indexes(
streams, gpu_indexes, gpu_count, first_indexes_for_div, new_num_blocks);
streams, gpu_indexes, first_indexes_for_div, new_num_blocks);
prop_simu_group_carries_mem->update_lut_indexes(
streams, gpu_indexes, gpu_count, second_indexes_for_div,
scalars_for_div, new_num_blocks);
streams, gpu_indexes, second_indexes_for_div, scalars_for_div,
new_num_blocks);
}
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {

@@ -3132,10 +3013,7 @@ template <typename Torus> struct int_mul_memory {
zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
zero_out_predicate_lut_f, gpu_memory_allocated);

auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
zero_out_predicate_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
zero_out_predicate_lut->broadcast_lut(streams, gpu_indexes);

zero_out_mem = new int_zero_out_if_buffer<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,

@@ -3208,8 +3086,8 @@ template <typename Torus> struct int_mul_memory {
streams[0], gpu_indexes[0],
luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
msb_vector_block_count);
auto active_gpu_count = get_active_gpu_count(total_block_count, gpu_count);
luts_array->broadcast_lut(streams, gpu_indexes, active_gpu_count);

luts_array->broadcast_lut(streams, gpu_indexes);
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,

@@ -3341,8 +3219,7 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cur_lut_bivariate->broadcast_lut(streams, gpu_indexes, active_gpu_count);
cur_lut_bivariate->broadcast_lut(streams, gpu_indexes);

lut_buffers_bivariate.push_back(cur_lut_bivariate);
}

@@ -3426,8 +3303,7 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
shift_lut_f, gpu_memory_allocated);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
cur_lut_bivariate->broadcast_lut(streams, gpu_indexes, active_gpu_count);
cur_lut_bivariate->broadcast_lut(streams, gpu_indexes);

lut_buffers_bivariate.push_back(cur_lut_bivariate);
}

@@ -3531,9 +3407,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
shift_last_block_lut_univariate->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
shift_last_block_lut_univariate->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
shift_last_block_lut_univariate->broadcast_lut(streams, gpu_indexes);

lut_buffers_univariate.push_back(shift_last_block_lut_univariate);
}

@@ -3558,9 +3432,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
padding_block_lut_f, gpu_memory_allocated);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
padding_block_lut_univariate->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
padding_block_lut_univariate->broadcast_lut(streams, gpu_indexes);

lut_buffers_univariate.push_back(padding_block_lut_univariate);

@@ -3599,9 +3471,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
shift_blocks_lut_bivariate->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
blocks_lut_f, gpu_memory_allocated);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
shift_blocks_lut_bivariate->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
shift_blocks_lut_bivariate->broadcast_lut(streams, gpu_indexes);

lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);
}

@@ -3715,13 +3585,9 @@ template <typename Torus> struct int_cmux_buffer {
predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
2 * num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0],
allocate_gpu_memory);
auto active_gpu_count_pred =
get_active_gpu_count(2 * num_radix_blocks, gpu_count);
predicate_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count_pred);
auto active_gpu_count_msg =
get_active_gpu_count(num_radix_blocks, gpu_count);
message_extract_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count_msg);

predicate_lut->broadcast_lut(streams, gpu_indexes);
message_extract_lut->broadcast_lut(streams, gpu_indexes);
}

void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,

@@ -3793,8 +3659,7 @@ template <typename Torus> struct int_are_all_block_true_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_max_value_f, gpu_memory_allocated);

auto active_gpu_count = get_active_gpu_count(max_chunks, gpu_count);
is_max_value->broadcast_lut(streams, gpu_indexes, active_gpu_count);
is_max_value->broadcast_lut(streams, gpu_indexes);
}

void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,

@@ -3854,8 +3719,7 @@ template <typename Torus> struct int_comparison_eq_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, operator_f, gpu_memory_allocated);

auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
operator_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
operator_lut->broadcast_lut(streams, gpu_indexes);

// f(x) -> x == 0
Torus total_modulus = params.message_modulus * params.carry_modulus;

@@ -3873,7 +3737,7 @@ template <typename Torus> struct int_comparison_eq_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);

is_non_zero_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
is_non_zero_lut->broadcast_lut(streams, gpu_indexes);

// Scalar may have up to num_radix_blocks blocks
scalar_comparison_luts = new int_radix_lut<Torus>(

@@ -3892,8 +3756,8 @@ template <typename Torus> struct int_comparison_eq_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f, gpu_memory_allocated);
}
scalar_comparison_luts->broadcast_lut(streams, gpu_indexes,
active_gpu_count);

scalar_comparison_luts->broadcast_lut(streams, gpu_indexes);
}

void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,

@@ -3964,8 +3828,8 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
block_selector_f, gpu_memory_allocated);
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
tree_inner_leaf_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);

tree_inner_leaf_lut->broadcast_lut(streams, gpu_indexes);
}

void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,

@@ -4152,7 +4016,8 @@ template <typename Torus> struct int_comparison_buffer {
identity_lut->get_degree(0), identity_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, identity_lut_f, gpu_memory_allocated);
identity_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);

identity_lut->broadcast_lut(streams, gpu_indexes);

uint32_t total_modulus = params.message_modulus * params.carry_modulus;
auto is_zero_f = [total_modulus](Torus x) -> Torus {

@@ -4169,7 +4034,7 @@ template <typename Torus> struct int_comparison_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_zero_f, gpu_memory_allocated);

is_zero_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
is_zero_lut->broadcast_lut(streams, gpu_indexes);

switch (op) {
case COMPARISON_TYPE::MAX:

@@ -4251,8 +4116,8 @@ template <typename Torus> struct int_comparison_buffer {
signed_lut->get_degree(0), signed_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, signed_lut_f, gpu_memory_allocated);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
signed_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);

signed_lut->broadcast_lut(streams, gpu_indexes);
}
}

@@ -4467,23 +4332,17 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
streams, gpu_indexes, gpu_count, params, 1, num_blocks,
allocate_gpu_memory, size_tracker);

generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], masking_luts_1[i]->get_lut(0, 0),
masking_luts_1[i]->get_degree(0),
masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
auto active_gpu_count1 = get_active_gpu_count(1, gpu_count);
masking_luts_1[i]->broadcast_lut(streams, gpu_indexes, active_gpu_count1);
int_radix_lut<Torus> *luts[2] = {masking_luts_1[i], masking_luts_2[i]};

generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], masking_luts_2[i]->get_lut(0, 0),
masking_luts_2[i]->get_degree(0),
masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_masking, gpu_memory_allocated);
auto active_gpu_count2 = get_active_gpu_count(num_blocks, gpu_count);
masking_luts_2[i]->broadcast_lut(streams, gpu_indexes, active_gpu_count2);
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, lut_f_masking,
gpu_memory_allocated);
luts[j]->broadcast_lut(streams, gpu_indexes);
}
}

// create and generate message_extract_lut_1 and message_extract_lut_2

@@ -4503,14 +4362,13 @@ template <typename Torus> struct unsigned_int_div_rem_memory {

int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
message_extract_lut_2};
auto active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
for (int j = 0; j < 2; j++) {
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0),
luts[j]->get_degree(0), luts[j]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
luts[j]->broadcast_lut(streams, gpu_indexes, active_gpu_count);
luts[j]->broadcast_lut(streams, gpu_indexes);
}

// Give name to closures to improve readability

@@ -4546,8 +4404,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams[0], gpu_indexes[0],
zero_out_if_overflow_did_not_happen[1]->get_lut(0, 0),

@@ -4556,8 +4413,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, cur_lut_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes);

// create and generate zero_out_if_overflow_happened
zero_out_if_overflow_happened = new int_radix_lut<Torus> *[2];

@@ -4584,8 +4440,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, params.message_modulus - 2,
gpu_memory_allocated);
zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes);
generate_device_accumulator_bivariate_with_factor<Torus>(
streams[0], gpu_indexes[0],
zero_out_if_overflow_happened[1]->get_lut(0, 0),

@@ -4594,12 +4449,10 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, overflow_happened_f, params.message_modulus - 1,
gpu_memory_allocated);
zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes);

// merge_overflow_flags_luts
merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
auto active_gpu_count_for_bits = get_active_gpu_count(1, gpu_count);
for (int i = 0; i < num_bits_in_message; i++) {
auto lut_f_bit = [i](Torus x, Torus y) -> Torus {
return (x == 0 && y == 0) << i;

@@ -4616,8 +4469,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
merge_overflow_flags_luts[i]->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f_bit, gpu_memory_allocated);
merge_overflow_flags_luts[i]->broadcast_lut(streams, gpu_indexes,
active_gpu_count_for_bits);
merge_overflow_flags_luts[i]->broadcast_lut(streams, gpu_indexes);
}
}

@@ -4934,7 +4786,7 @@ template <typename Torus> struct int_bitop_buffer {
gpu_memory_allocated = allocate_gpu_memory;
this->op = op;
this->params = params;
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);

switch (op) {
case BITAND:
case BITOR:

@@ -4961,7 +4813,7 @@ template <typename Torus> struct int_bitop_buffer {
lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_bivariate_f, gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
lut->broadcast_lut(streams, gpu_indexes);
}
break;
default:

@@ -4991,7 +4843,7 @@ template <typename Torus> struct int_bitop_buffer {
params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_univariate_scalar_f,
gpu_memory_allocated);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
lut->broadcast_lut(streams, gpu_indexes);
}
}
}

@@ -5276,10 +5128,7 @@ template <typename Torus> struct int_div_rem_memory {
compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
f_compare_extracted_signed_bits, gpu_memory_allocated);
auto active_gpu_count_cmp =
get_active_gpu_count(1, gpu_count); // only 1 block needed
compare_signed_bits_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count_cmp);
compare_signed_bits_lut->broadcast_lut(streams, gpu_indexes);
}
}

@@ -5949,7 +5798,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
this->allocate_gpu_memory = allocate_gpu_memory;
this->direction = direction;
this->bit_value = bit_value;
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);

this->univ_lut_mem = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);

@@ -5988,7 +5837,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
params.carry_modulus, generate_uni_lut_lambda, allocate_gpu_memory);

if (allocate_gpu_memory) {
univ_lut_mem->broadcast_lut(streams, gpu_indexes, active_gpu_count);
univ_lut_mem->broadcast_lut(streams, gpu_indexes);
}

auto generate_bi_lut_lambda =

@@ -6007,7 +5856,7 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
params.carry_modulus, generate_bi_lut_lambda, allocate_gpu_memory);

if (allocate_gpu_memory) {
biv_lut_mem->broadcast_lut(streams, gpu_indexes, active_gpu_count);
biv_lut_mem->broadcast_lut(streams, gpu_indexes);
}

this->tmp_ct = new CudaRadixCiphertextFFI;

@@ -6225,8 +6074,7 @@ template <typename Torus> struct int_grouped_oprf_memory {
cuda_memcpy_async_to_gpu(luts->get_lut_indexes(0, 0), this->h_lut_indexes,
num_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
auto active_gpu_count = get_active_gpu_count(num_blocks, gpu_count);
luts->broadcast_lut(streams, gpu_indexes, active_gpu_count);
luts->broadcast_lut(streams, gpu_indexes);

free(h_corrections);
}

@@ -232,13 +232,8 @@ template <typename Torus> struct zk_expand_mem {
num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
allocate_gpu_memory);

auto active_gpu_count = get_active_gpu_count(2 * num_lwes, gpu_count);
message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes,
active_gpu_count);
message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes);

message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(
streams, gpu_indexes, active_gpu_count, 2 * num_lwes, size_tracker,
allocate_gpu_memory);
// The expanded LWEs will always be on the casting key format
tmp_expanded_lwes = (Torus *)cuda_malloc_with_size_tracking_async(
num_lwes * (casting_params.big_lwe_dimension + 1) * sizeof(Torus),

@@ -148,8 +148,7 @@ __host__ void are_all_comparisons_block_true(
cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
h_lut_indexes, num_chunks * sizeof(Torus),
streams[0], gpu_indexes[0]);
auto active_gpu_count = get_active_gpu_count(num_chunks, gpu_count);
is_max_value_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
is_max_value_lut->broadcast_lut(streams, gpu_indexes);
}
lut = is_max_value_lut;
}

@@ -168,10 +167,7 @@ __host__ void are_all_comparisons_block_true(
is_max_value_lut->h_lut_indexes,
is_max_value_lut->num_blocks * sizeof(Torus),
streams[0], gpu_indexes[0]);
auto active_gpu_count_is_max =
get_active_gpu_count(is_max_value_lut->num_blocks, gpu_count);
is_max_value_lut->broadcast_lut(streams, gpu_indexes,
active_gpu_count_is_max, false);
is_max_value_lut->broadcast_lut(streams, gpu_indexes);
reset_radix_ciphertext_blocks(lwe_array_out, 1);
return;
} else {

@@ -503,9 +499,7 @@ __host__ void tree_sign_reduction(
streams[0], gpu_indexes[0], last_lut->get_lut(0, 0),
last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f, true);

auto active_gpu_count = get_active_gpu_count(1, gpu_count);
last_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
last_lut->broadcast_lut(streams, gpu_indexes);

// Last leaf
integer_radix_apply_univariate_lookup_table_kb<Torus>(

@@ -363,17 +363,14 @@ host_integer_decompress(cudaStream_t const *streams,
lut->lwe_trivial_indexes_vec;

/// Make sure all data that should be on GPU 0 is indeed there
cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in,
gpu_indexes[j]);
}
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

/// With multiple GPUs we push to the vectors on each GPU then when we
/// gather data to GPU 0 we can copy back to the original indexing
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
extracted_lwe, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, lut->active_gpu_count, num_blocks_to_decompress,
lut->active_gpu_count, num_blocks_to_decompress,
compression_params.small_lwe_dimension + 1);

/// Apply PBS

@@ -392,19 +389,12 @@ host_integer_decompress(cudaStream_t const *streams,
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)d_lwe_array_out->ptr,
lwe_after_pbs_vec, lut->lwe_indexes_out,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
num_blocks_to_decompress, encryption_params.big_lwe_dimension + 1);
lut->using_trivial_lwe_indexes, num_blocks_to_decompress,
encryption_params.big_lwe_dimension + 1);

/// Synchronize all GPUs
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j],
gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
} else {

@@ -311,8 +311,8 @@ __host__ void host_unsigned_integer_div_rem_kb(
mem_ptr->scalars_for_overflow_sub
[merged_interesting_remainder->num_radix_blocks - 1];
mem_ptr->overflow_sub_mem->update_lut_indexes(
streams, gpu_indexes, gpu_count, first_indexes, second_indexes,
scalar_indexes, merged_interesting_remainder->num_radix_blocks);
streams, gpu_indexes, first_indexes, second_indexes, scalar_indexes,
merged_interesting_remainder->num_radix_blocks);
host_integer_overflowing_sub<uint64_t>(
streams, gpu_indexes, gpu_count, new_remainder,
merged_interesting_remainder, interesting_divisor,

@@ -567,6 +567,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
grouping_factor, num_radix_blocks, pbs_type, num_many_lut, lut_stride);
} else {
/// Make sure all data that should be on GPU 0 is indeed there

cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);

@@ -578,8 +579,8 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
lut->active_gpu_count, num_radix_blocks, big_lwe_dimension + 1);
lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,

@@ -603,7 +604,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)lwe_array_out->ptr,
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {

@@ -693,8 +694,8 @@ __host__ void integer_radix_apply_many_univariate_lookup_table_kb(
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(Torus *)lwe_array_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
lut->active_gpu_count, num_radix_blocks, big_lwe_dimension + 1);
lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,

@@ -822,8 +823,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(Torus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_vec,
lut->active_gpu_count, num_radix_blocks, big_lwe_dimension + 1);
lut->using_trivial_lwe_indexes, lut->active_gpu_count, num_radix_blocks,
big_lwe_dimension + 1);
POP_RANGE()
/// Apply KS to go from a big LWE dimension to a small LWE dimension
execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,

@@ -847,7 +848,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)(lwe_array_out->ptr),
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_radix_blocks, big_lwe_dimension + 1);
num_radix_blocks, big_lwe_dimension + 1);
POP_RANGE()
// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
@@ -1693,7 +1694,6 @@ __host__ void reduce_signs(
"than the number of blocks to operate on")

auto diff_buffer = mem_ptr->diff_buffer;
auto active_gpu_count = mem_ptr->active_gpu_count;

auto params = mem_ptr->params;
auto glwe_dimension = params.glwe_dimension;
@@ -1723,7 +1723,7 @@ __host__ void reduce_signs(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, reduce_two_orderings_function, true);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
lut->broadcast_lut(streams, gpu_indexes);

while (num_sign_blocks > 2) {
pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
@@ -1754,7 +1754,7 @@ __host__ void reduce_signs(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, final_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
lut->broadcast_lut(streams, gpu_indexes);

pack_blocks<Torus>(streams[0], gpu_indexes[0], signs_b, signs_a,
num_sign_blocks, message_modulus);
@@ -1774,7 +1774,7 @@ __host__ void reduce_signs(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, final_lut_f, true);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
lut->broadcast_lut(streams, gpu_indexes);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
@@ -1800,8 +1800,7 @@ uint64_t scratch_cuda_apply_univariate_lut_kb(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes, active_gpu_count);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes);
POP_RANGE()
return size_tracker;
}
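Note: every call site above drops the explicit active_gpu_count argument from broadcast_lut; after the revert the LUT object presumably falls back to its own active_gpu_count member. The body of broadcast_lut is not part of this diff, so the following is only a hedged, minimal sketch of what such a broadcast conceptually does (copy the LUT data and its index array from GPU 0 to the other active GPUs), reusing the cuda_memcpy_async_gpu_to_gpu helper that appears elsewhere in this diff; broadcast_lut_sketch, lut_size and num_blocks are illustrative names and sizes, not the library's.

template <typename Torus>
void broadcast_lut_sketch(cudaStream_t const *streams,
                          uint32_t const *gpu_indexes,
                          uint32_t active_gpu_count,
                          std::vector<Torus *> &lut_vec,
                          std::vector<Torus *> &lut_indexes_vec,
                          size_t lut_size, size_t num_blocks) {
  // Copy the LUT polynomials and the per-block LUT indexes from GPU 0 to
  // every other active GPU, one asynchronous copy per device stream.
  for (uint32_t i = 1; i < active_gpu_count; i++) {
    cuda_memcpy_async_gpu_to_gpu(lut_vec[i], lut_vec[0],
                                 lut_size * sizeof(Torus), streams[i],
                                 gpu_indexes[i]);
    cuda_memcpy_async_gpu_to_gpu(lut_indexes_vec[i], lut_indexes_vec[0],
                                 num_blocks * sizeof(Torus), streams[i],
                                 gpu_indexes[i]);
  }
}

Some of the removed call sites further down also passed a trailing boolean (false) so that only the index arrays, not the LUT contents, were re-sent; after the revert that distinction is gone from the signature.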
@@ -1838,8 +1837,7 @@ uint64_t scratch_cuda_apply_many_univariate_lut_kb(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes, active_gpu_count);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes);
POP_RANGE()
return size_tracker;
}
@@ -1876,8 +1874,7 @@ uint64_t scratch_cuda_apply_bivariate_lut_kb(
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
streams[0], gpu_indexes[0], allocate_gpu_memory);
*(*mem_ptr)->get_degree(0) = lut_degree;
auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes, active_gpu_count);
(*mem_ptr)->broadcast_lut(streams, gpu_indexes);
POP_RANGE()
return size_tracker;
}
@@ -2393,9 +2390,8 @@ __host__ void integer_radix_apply_noise_squashing_kb(
multi_gpu_scatter_lwe_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
(InputTorus *)lwe_array_pbs_in->ptr, lut->lwe_indexes_in,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_scatter_vec,
lut->active_gpu_count, lwe_array_out->num_radix_blocks,
lut->input_big_lwe_dimension + 1);
lut->using_trivial_lwe_indexes, lut->active_gpu_count,
lwe_array_out->num_radix_blocks, lut->input_big_lwe_dimension + 1);

execute_keyswitch_async<InputTorus>(
streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
@@ -2418,8 +2414,8 @@ __host__ void integer_radix_apply_noise_squashing_kb(
multi_gpu_gather_lwe_async<__uint128_t>(
streams, gpu_indexes, active_gpu_count,
(__uint128_t *)lwe_array_out->ptr, lwe_after_pbs_vec, nullptr,
lut->using_trivial_lwe_indexes, lut->lwe_aligned_gather_vec,
lwe_array_out->num_radix_blocks, big_lwe_dimension + 1);
lut->using_trivial_lwe_indexes, lwe_array_out->num_radix_blocks,
big_lwe_dimension + 1);

/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {

@@ -416,9 +416,7 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
lut_stride);
} else {

// we just need to broadcast the indexes
luts_message_carry->broadcast_lut(streams, gpu_indexes, active_gpu_count,
false);
luts_message_carry->broadcast_lut(streams, gpu_indexes);
luts_message_carry->using_trivial_lwe_indexes = false;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
@@ -470,9 +468,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
lut_stride);
} else {
uint32_t num_blocks_in_apply_lut = 2 * num_radix_blocks;
// we just need to broadcast the indexes
luts_message_carry->broadcast_lut(streams, gpu_indexes, active_gpu_count,
false);

luts_message_carry->broadcast_lut(streams, gpu_indexes);
luts_message_carry->using_trivial_lwe_indexes = false;

integer_radix_apply_univariate_lookup_table_kb<Torus>(

@@ -48,10 +48,7 @@ void host_integer_grouped_oprf(
std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;

cuda_event_record(lut->event_scatter_in, streams[0], gpu_indexes[0]);
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[j], lut->event_scatter_in, gpu_indexes[j]);
}
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

if (!lut->using_trivial_lwe_indexes) {
PANIC("lut->using_trivial_lwe_indexes should be true");
@@ -60,7 +57,7 @@ void host_integer_grouped_oprf(
multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
seeded_lwe_input, lut->lwe_indexes_in, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, active_gpu_count, num_blocks_to_process,
active_gpu_count, num_blocks_to_process,
mem_ptr->params.small_lwe_dimension + 1);

execute_pbs_async<Torus, Torus>(
@@ -76,17 +73,10 @@ void host_integer_grouped_oprf(
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)radix_lwe_out->ptr,
lwe_after_pbs_vec, lut->lwe_indexes_out, lut->using_trivial_lwe_indexes,
lut->lwe_aligned_vec, num_blocks_to_process,
mem_ptr->params.big_lwe_dimension + 1);
num_blocks_to_process, mem_ptr->params.big_lwe_dimension + 1);

// other gpus record their events
for (int j = 1; j < active_gpu_count; j++) {
cuda_event_record(lut->event_scatter_out[j], streams[j], gpu_indexes[j]);
}
// GPU 0 waits for all
for (int j = 1; j < active_gpu_count; j++) {
cuda_stream_wait_event(streams[0], lut->event_scatter_out[j],
gpu_indexes[0]);
for (uint32_t i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}

@@ -47,8 +47,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
num_clear_blocks * sizeof(Torus), streams[0],
gpu_indexes[0]);
auto active_gpu_count = get_active_gpu_count(num_clear_blocks, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count, false);
lut->broadcast_lut(streams, gpu_indexes);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, output, input, bsks, ksks,

@@ -154,8 +154,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_last_leaf_lut_f, true);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
lut->broadcast_lut(streams, gpu_indexes);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out,
@@ -254,8 +253,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
lut->broadcast_lut(streams, gpu_indexes);

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_lsb_out,
@@ -288,8 +286,8 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
one_block_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);

one_block_lut->broadcast_lut(streams, gpu_indexes);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
@@ -436,8 +434,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
lut->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f, true);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
lut->broadcast_lut(streams, gpu_indexes);

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, are_all_msb_zeros,
@@ -543,8 +540,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
signed_msb_lut->get_degree(0), signed_msb_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, lut_f, true);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
signed_msb_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);
signed_msb_lut->broadcast_lut(streams, gpu_indexes);

CudaRadixCiphertextFFI sign_block;
as_radix_ciphertext_slice<Torus>(
@@ -592,8 +588,8 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
one_block_lut->get_degree(0), one_block_lut->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, one_block_lut_f, true);
auto active_gpu_count = get_active_gpu_count(1, gpu_count);
one_block_lut->broadcast_lut(streams, gpu_indexes, active_gpu_count);

one_block_lut->broadcast_lut(streams, gpu_indexes);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks,
@@ -823,11 +819,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
gpu_indexes[0]);
}
auto active_gpu_count =
get_active_gpu_count(num_halved_scalar_blocks, gpu_count);
// We use false cause we only will broadcast the indexes
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes,
active_gpu_count, false);
scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
lsb_streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,

@@ -38,19 +38,6 @@ void multi_gpu_copy_array_async(cudaStream_t const *streams,
gpu_indexes[i], gpu_memory_allocated);
}
}
/// Copy an array residing on one CPU to all active gpus
template <typename Torus>
void multi_gpu_copy_array_from_cpu_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<Torus *> &dest, Torus const *h_src,
uint32_t elements_per_gpu, bool gpu_memory_allocated) {
dest.resize(gpu_count);
for (uint i = 0; i < gpu_count; i++) {
cuda_memcpy_with_size_tracking_async_to_gpu(
dest[i], h_src, elements_per_gpu * sizeof(Torus), streams[i],
gpu_indexes[i], gpu_memory_allocated);
}
}
/// Allocates the input/output vector for all devices
/// Initializes also the related indexing and initializes it to the trivial
/// index
@@ -106,13 +93,10 @@ void multi_gpu_alloc_lwe_many_lut_output_async(
}
}

// This function reads lwes using the indexes and place them in a single aligned
// array. This function is needed before communication to perform a single
// contiguous data movement. Each block handles one lwe.
// Each block handles one lwe
template <typename Torus>
__global__ void align_with_indexes(Torus *d_packed_vector,
Torus const *d_vector,
Torus const *d_indexes, int lwe_size) {
__global__ void pack_data(Torus *d_packed_vector, Torus const *d_vector,
Torus const *d_indexes, int lwe_size) {

int output_offset = blockIdx.x * lwe_size;
int input_offset = d_indexes[blockIdx.x] * lwe_size;
@@ -121,12 +105,10 @@ __global__ void align_with_indexes(Torus *d_packed_vector,
}
}

// This function takes the aligned array after communication and places it in
// the corresponding indexes. Each block handles one lwe.
// Each block handles one lwe
template <typename Torus>
__global__ void realign_with_indexes(Torus *d_vector,
Torus const *d_packed_vector,
Torus const *d_indexes, int lwe_size) {
__global__ void unpack_data(Torus *d_vector, Torus const *d_packed_vector,
Torus const *d_indexes, int lwe_size) {

int input_offset = blockIdx.x * lwe_size;
int output_offset = d_indexes[blockIdx.x] * lwe_size;
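Both kernels above are shown only down to their offset computations; the copy loops themselves fall outside the hunks. A hedged sketch of what those bodies plausibly look like, given the comments (one block per LWE, threads striding over its lwe_size coefficients); the loop details are assumptions, not lines from this commit, and the _sketch names are illustrative:

template <typename Torus>
__global__ void pack_data_sketch(Torus *d_packed_vector, Torus const *d_vector,
                                 Torus const *d_indexes, int lwe_size) {
  // Block b gathers the LWE selected by d_indexes[b] into slot b of the
  // packed (contiguous) buffer; threads stride over its coefficients.
  int output_offset = blockIdx.x * lwe_size;
  int input_offset = d_indexes[blockIdx.x] * lwe_size;
  for (int k = threadIdx.x; k < lwe_size; k += blockDim.x)
    d_packed_vector[output_offset + k] = d_vector[input_offset + k];
}

template <typename Torus>
__global__ void unpack_data_sketch(Torus *d_vector,
                                   Torus const *d_packed_vector,
                                   Torus const *d_indexes, int lwe_size) {
  // Mirror operation: block b scatters slot b of the packed buffer back to
  // the position given by d_indexes[b].
  int input_offset = blockIdx.x * lwe_size;
  int output_offset = d_indexes[blockIdx.x] * lwe_size;
  for (int k = threadIdx.x; k < lwe_size; k += blockDim.x)
    d_vector[output_offset + k] = d_packed_vector[input_offset + k];
}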
@@ -146,7 +128,6 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
uint32_t gpu_count, std::vector<Torus *> &dest,
Torus const *src, Torus const *d_src_indexes,
bool is_trivial_index,
std::vector<Torus *> &aligned_vec,
uint32_t max_active_gpu_count,
uint32_t num_inputs, uint32_t lwe_size) {

@@ -169,28 +150,28 @@ void multi_gpu_scatter_lwe_async(cudaStream_t const *streams,
gpu_indexes[i], true);

} else {
if (aligned_vec.size() == 0)
PANIC("Cuda error: auxiliary arrays should be setup!");

if (d_src_indexes == nullptr)
PANIC("Cuda error: source indexes should be initialized!");
Torus *d_packed_vector = (Torus *)cuda_malloc_async(
inputs_on_gpu * lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);

cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);
cuda_set_device(gpu_indexes[0]);
align_with_indexes<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
aligned_vec[i], (Torus *)src, (Torus *)d_src_indexes + gpu_offset,

pack_data<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
d_packed_vector, (Torus *)src, (Torus *)d_src_indexes + gpu_offset,
lwe_size);
check_cuda_error(cudaGetLastError());
cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);

cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dest[i], aligned_vec[i], inputs_on_gpu * lwe_size * sizeof(Torus),
dest[i], d_packed_vector, inputs_on_gpu * lwe_size * sizeof(Torus),
streams[i], gpu_indexes[i], true);

cudaEvent_t temp_event = cuda_create_event(gpu_indexes[i]);
cuda_event_record(temp_event, streams[i], gpu_indexes[i]);
cuda_stream_wait_event(streams[0], temp_event, gpu_indexes[0]);
cuda_drop_async(d_packed_vector, streams[0], gpu_indexes[0]);
}
}
}
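The non-trivial-index branch above allocates a temporary packed buffer on GPU 0's stream, lets GPU i copy it out, and only then frees it, with two events enforcing the ordering. The same pattern, restated as a self-contained sketch using the plain CUDA runtime API instead of the repository's wrappers (the function name, the two-stream setup, and the destination pointer are illustrative assumptions):

#include <cuda_runtime.h>

// s0 is the producer stream on the source device, s1 the consumer stream on
// another device (peer access assumed). dst must be valid for the consumer.
void packed_handoff_sketch(cudaStream_t s0, cudaStream_t s1, void *dst,
                           size_t bytes) {
  void *tmp = nullptr;
  cudaMallocAsync(&tmp, bytes, s0); // temporary packed buffer, stream-ordered

  // ... launch the packing kernel on s0 so that tmp gets filled ...

  cudaEvent_t ready, consumed;
  cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);
  cudaEventCreateWithFlags(&consumed, cudaEventDisableTiming);

  cudaEventRecord(ready, s0);           // packing is ordered before this event
  cudaStreamWaitEvent(s1, ready, 0);    // consumer waits before reading tmp
  cudaMemcpyAsync(dst, tmp, bytes, cudaMemcpyDeviceToDevice, s1);

  cudaEventRecord(consumed, s1);        // copy is ordered before this event
  cudaStreamWaitEvent(s0, consumed, 0); // producer waits before freeing tmp
  cudaFreeAsync(tmp, s0);               // safe: free is ordered after the copy

  cudaEventDestroy(ready);              // released once pending work completes
  cudaEventDestroy(consumed);
}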
@@ -203,7 +184,6 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count,
Torus *dest, const std::vector<Torus *> &src,
Torus *d_dest_indexes, bool is_trivial_index,
std::vector<Torus *> &aligned_vec,
uint32_t num_inputs, uint32_t lwe_size) {

for (uint i = 0; i < gpu_count; i++) {
@@ -221,27 +201,30 @@ void multi_gpu_gather_lwe_async(cudaStream_t const *streams,
d_dest, d_src, inputs_on_gpu * lwe_size * sizeof(Torus), streams[i],
gpu_indexes[i], true);
} else {
if (aligned_vec.size() == 0)
PANIC("Cuda error: auxiliary arrays should be setup!");
if (d_dest_indexes == nullptr)
PANIC("Cuda error: destination indexes should be initialized!");

Torus *d_packed_vector = (Torus *)cuda_malloc_async(
inputs_on_gpu * lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);
cudaEvent_t temp_event2 = cuda_create_event(gpu_indexes[0]);

cuda_event_record(temp_event2, streams[0], gpu_indexes[0]);
cuda_stream_wait_event(streams[i], temp_event2, gpu_indexes[i]);

cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
aligned_vec[i], src[i], inputs_on_gpu * lwe_size * sizeof(Torus),
d_packed_vector, src[i], inputs_on_gpu * lwe_size * sizeof(Torus),
streams[i], gpu_indexes[i], true);

cudaEvent_t temp_event3 = cuda_create_event(gpu_indexes[i]);

cuda_event_record(temp_event3, streams[i], gpu_indexes[i]);
cuda_stream_wait_event(streams[0], temp_event3, gpu_indexes[0]);
cuda_set_device(gpu_indexes[0]);
realign_with_indexes<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
dest, aligned_vec[i], (Torus *)d_dest_indexes + gpu_offset, lwe_size);

unpack_data<Torus><<<inputs_on_gpu, 1024, 0, streams[0]>>>(
dest, d_packed_vector, (Torus *)d_dest_indexes + gpu_offset,
lwe_size);
check_cuda_error(cudaGetLastError());

cuda_drop_async(d_packed_vector, streams[0], gpu_indexes[0]);
}
}
}