refactor(gpu): introduce InternalCudaStreams to improve management of multiple streams per GPU

commit 5273f61593
parent 182aad99f1
Author: Enzo Di Maria
Date: 2025-11-27 17:00:27 +01:00
Committed by: Agnès Leroy

5 changed files with 357 additions and 653 deletions

View File

@@ -183,6 +183,214 @@ public:
}
};
struct InternalCudaStreams {
private:
CudaStreams *_internal_cuda_streams;
uint32_t _num_internal_cuda_streams;
uint32_t _num_gpus;
cudaEvent_t _incoming_event;
cudaEvent_t *_outgoing_events;
InternalCudaStreams(const InternalCudaStreams &) = delete;
InternalCudaStreams &operator=(const InternalCudaStreams &) = delete;
public:
InternalCudaStreams() {
_internal_cuda_streams = nullptr;
_incoming_event = nullptr;
_outgoing_events = nullptr;
_num_internal_cuda_streams = 0;
_num_gpus = 0;
}
void create_internal_cuda_streams_on_same_gpus(
const CudaStreams &base_streams, uint32_t num_internal_cuda_streams) {
PANIC_IF_FALSE(_internal_cuda_streams == nullptr,
"InternalCudaStreams: object already initialized.");
_num_internal_cuda_streams = num_internal_cuda_streams;
_num_gpus = base_streams.count();
if (num_internal_cuda_streams > 0) {
_internal_cuda_streams = new CudaStreams[num_internal_cuda_streams];
for (uint32_t i = 0; i < num_internal_cuda_streams; ++i) {
_internal_cuda_streams[i].create_on_same_gpus(base_streams);
}
}
if (_num_gpus > 0) {
_incoming_event = cuda_create_event(base_streams.gpu_index(0));
}
uint32_t total_events = num_internal_cuda_streams * _num_gpus;
if (total_events > 0) {
_outgoing_events = new cudaEvent_t[total_events];
for (uint32_t s = 0; s < num_internal_cuda_streams; ++s) {
for (uint32_t g = 0; g < _num_gpus; ++g) {
_outgoing_events[s * _num_gpus + g] =
cuda_create_event(base_streams.gpu_index(g));
}
}
}
}
CudaStreams &operator[](uint32_t idx) const {
PANIC_IF_FALSE(idx < _num_internal_cuda_streams,
"InternalCudaStreams index out of bounds");
return _internal_cuda_streams[idx];
}
uint32_t num_streams() const { return _num_internal_cuda_streams; }
void
internal_streams_wait_for_main_stream_0(const CudaStreams &main_streams) {
PANIC_IF_FALSE(main_streams.gpu_index(0) ==
_internal_cuda_streams[0].gpu_index(0),
"InternalCudaStreams: gpu_index(0) of main_streams should "
"be the same as _internal_cuda_streams[0].");
cuda_event_record(_incoming_event, main_streams.stream(0),
main_streams.gpu_index(0));
for (uint32_t s = 0; s < _num_internal_cuda_streams; ++s) {
for (uint32_t g = 0; g < _num_gpus; ++g) {
cuda_stream_wait_event(_internal_cuda_streams[s].stream(g),
_incoming_event,
_internal_cuda_streams[s].gpu_index(g));
}
}
}
void
internal_streams_slice_wait_for_main_stream_0(const CudaStreams &main_streams,
const uint32_t *stream_indices,
size_t num_indices) {
PANIC_IF_FALSE(main_streams.gpu_index(0) ==
_internal_cuda_streams[0].gpu_index(0),
"InternalCudaStreams: gpu_index(0) of main_streams should "
"be the same as _internal_cuda_streams[0].");
cuda_event_record(_incoming_event, main_streams.stream(0),
main_streams.gpu_index(0));
for (size_t i = 0; i < num_indices; ++i) {
uint32_t s_idx = stream_indices[i];
PANIC_IF_FALSE(s_idx < _num_internal_cuda_streams,
"InternalCudaStreams: stream index out of bounds");
for (uint32_t g = 0; g < _num_gpus; ++g) {
cuda_stream_wait_event(_internal_cuda_streams[s_idx].stream(g),
_incoming_event,
_internal_cuda_streams[s_idx].gpu_index(g));
}
}
}
void
main_stream_0_wait_for_internal_streams(const CudaStreams &main_streams) {
PANIC_IF_FALSE(main_streams.gpu_index(0) ==
_internal_cuda_streams[0].gpu_index(0),
"InternalCudaStreams: gpu_index(0) of main_streams should "
"be the same as _internal_cuda_streams[0].");
for (uint32_t s = 0; s < _num_internal_cuda_streams; ++s) {
for (uint32_t g = 0; g < _num_gpus; ++g) {
cuda_event_record(_outgoing_events[s * _num_gpus + g],
_internal_cuda_streams[s].stream(g),
_internal_cuda_streams[s].gpu_index(g));
}
}
for (uint32_t s = 0; s < _num_internal_cuda_streams; ++s) {
for (uint32_t g = 0; g < _num_gpus; ++g) {
cuda_stream_wait_event(main_streams.stream(0),
_outgoing_events[s * _num_gpus + g],
main_streams.gpu_index(0));
}
}
}
void
main_stream_0_wait_for_internal_streams_slice(const CudaStreams &main_streams,
const uint32_t *stream_indices,
size_t num_indices) {
PANIC_IF_FALSE(main_streams.gpu_index(0) ==
_internal_cuda_streams[0].gpu_index(0),
"InternalCudaStreams: gpu_index(0) of main_streams should "
"be the same as _internal_cuda_streams[0].");
for (size_t i = 0; i < num_indices; ++i) {
uint32_t s_idx = stream_indices[i];
PANIC_IF_FALSE(s_idx < _num_internal_cuda_streams,
"InternalCudaStreams: stream index out of bounds");
for (uint32_t g = 0; g < _num_gpus; ++g) {
cuda_event_record(_outgoing_events[s_idx * _num_gpus + g],
_internal_cuda_streams[s_idx].stream(g),
_internal_cuda_streams[s_idx].gpu_index(g));
}
}
for (size_t i = 0; i < num_indices; ++i) {
uint32_t s_idx = stream_indices[i];
for (uint32_t g = 0; g < _num_gpus; ++g) {
cuda_stream_wait_event(main_streams.stream(0),
_outgoing_events[s_idx * _num_gpus + g],
main_streams.gpu_index(0));
}
}
}
void release(const CudaStreams &main_streams) {
PANIC_IF_FALSE(main_streams.gpu_index(0) ==
_internal_cuda_streams[0].gpu_index(0),
"InternalCudaStreams: gpu_index(0) of main_streams should "
"be the same as _internal_cuda_streams[0].");
cuda_synchronize_stream(main_streams.stream(0), main_streams.gpu_index(0));
if (_outgoing_events && _internal_cuda_streams) {
for (uint32_t s = 0; s < _num_internal_cuda_streams; ++s) {
for (uint32_t g = 0; g < _num_gpus; ++g) {
cuda_event_destroy(_outgoing_events[s * _num_gpus + g],
_internal_cuda_streams[s].gpu_index(g));
}
}
delete[] _outgoing_events;
_outgoing_events = nullptr;
}
if (_incoming_event && _internal_cuda_streams) {
cuda_event_destroy(_incoming_event,
_internal_cuda_streams[0].gpu_index(0));
_incoming_event = nullptr;
}
if (_internal_cuda_streams) {
for (uint32_t i = 0; i < _num_internal_cuda_streams; ++i) {
_internal_cuda_streams[i].release();
}
delete[] _internal_cuda_streams;
_internal_cuda_streams = nullptr;
}
}
~InternalCudaStreams() {
PANIC_IF_FALSE(_internal_cuda_streams == nullptr &&
_incoming_event == nullptr &&
_outgoing_events == nullptr,
"InternalCudaStreams: must call release before destruction");
}
};
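For reference, a minimal lifecycle sketch of the new struct as seen from a caller, assuming an already-initialized CudaStreams main_streams; launch_work is a hypothetical stand-in for real kernel launches:

// Sketch only: `launch_work` is hypothetical.
InternalCudaStreams internal;
internal.create_internal_cuda_streams_on_same_gpus(main_streams, 2);

// Fork: every internal stream on every GPU waits for main stream 0.
internal.internal_streams_wait_for_main_stream_0(main_streams);
for (uint32_t s = 0; s < internal.num_streams(); ++s)
  launch_work(internal[s]); // independent work per internal stream
// Join: main stream 0 waits for all internal streams.
internal.main_stream_0_wait_for_internal_streams(main_streams);

// release() synchronizes main stream 0, then destroys events and streams;
// the destructor panics if release() has not been called first.
internal.release(main_streams);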
struct CudaStreamsBarrier {
private:
std::vector<cudaEvent_t> _events;

View File

@@ -2443,12 +2443,7 @@ template <typename Torus> struct int_borrow_prop_memory {
int_radix_params params;
CudaStreams active_streams;
CudaStreams sub_streams_1;
CudaStreams sub_streams_2;
cudaEvent_t *incoming_events;
cudaEvent_t *outgoing_events1;
cudaEvent_t *outgoing_events2;
InternalCudaStreams internal_streams;
uint32_t compute_overflow;
bool gpu_memory_allocated;
@@ -2524,20 +2519,8 @@ template <typename Torus> struct int_borrow_prop_memory {
}
active_streams = streams.active_gpu_subset(num_radix_blocks);
sub_streams_1.create_on_same_gpus(active_streams);
sub_streams_2.create_on_same_gpus(active_streams);
incoming_events =
(cudaEvent_t *)malloc(active_streams.count() * sizeof(cudaEvent_t));
outgoing_events1 =
(cudaEvent_t *)malloc(active_streams.count() * sizeof(cudaEvent_t));
outgoing_events2 =
(cudaEvent_t *)malloc(active_streams.count() * sizeof(cudaEvent_t));
for (uint j = 0; j < active_streams.count(); j++) {
incoming_events[j] = cuda_create_event(active_streams.gpu_index(j));
outgoing_events1[j] = cuda_create_event(active_streams.gpu_index(j));
outgoing_events2[j] = cuda_create_event(active_streams.gpu_index(j));
}
internal_streams.create_internal_cuda_streams_on_same_gpus(active_streams,
2);
};
// needed for the division to update the lut indexes
@@ -2564,21 +2547,9 @@ template <typename Torus> struct int_borrow_prop_memory {
delete lut_borrow_flag;
}
// The substreams have to be synchronized before destroying events
internal_streams.release(streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
// release events
for (uint j = 0; j < active_streams.count(); j++) {
cuda_event_destroy(incoming_events[j], active_streams.gpu_index(j));
cuda_event_destroy(outgoing_events1[j], active_streams.gpu_index(j));
cuda_event_destroy(outgoing_events2[j], active_streams.gpu_index(j));
}
free(incoming_events);
free(outgoing_events1);
free(outgoing_events2);
sub_streams_1.release();
sub_streams_2.release();
};
};
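Both the removed bookkeeping and the new struct sit on the same small event helpers. Their implementations live elsewhere in the backend; the sketch below shows what they presumably wrap in terms of the CUDA runtime (the device switch and the timing-disabled flag are assumptions):

#include <cuda_runtime.h>

cudaEvent_t sketch_create_event(uint32_t gpu_index) {
  cudaSetDevice(gpu_index);
  cudaEvent_t event;
  // Ordering-only events: timing is not needed for synchronization.
  cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
  return event;
}

void sketch_event_record(cudaEvent_t event, cudaStream_t stream,
                         uint32_t gpu_index) {
  cudaSetDevice(gpu_index);
  cudaEventRecord(event, stream);
}

void sketch_stream_wait_event(cudaStream_t stream, cudaEvent_t event,
                              uint32_t gpu_index) {
  cudaSetDevice(gpu_index);
  // A stream may wait on an event recorded on another device, which is
  // what lets main stream 0 act as a cross-GPU fork/join point.
  cudaStreamWaitEvent(stream, event, 0);
}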
std::pair<bool, bool> get_invert_flags(COMPARISON_TYPE compare);

View File

@@ -1,4 +1,6 @@
#pragma once
#include "cast.h"
#include "helper_multi_gpu.h"
#include "integer/comparison.h"
#include "integer/radix_ciphertext.cuh"
#include "integer_utilities.h"
@@ -11,20 +13,18 @@ template <typename Torus> struct int_equality_selectors_buffer {
int_radix_params params;
bool allocate_gpu_memory;
uint32_t lut_stride;
uint32_t num_possible_values;
int_radix_lut<Torus> *comparison_luts;
CudaRadixCiphertextFFI *tmp_many_luts_output;
CudaRadixCiphertextFFI **tmp_block_comparisons;
int_comparison_buffer<Torus> **reduction_buffers;
CudaStreams active_streams;
CudaStreams *sub_streams;
cudaEvent_t incoming_event;
cudaEvent_t *outgoing_events;
InternalCudaStreams internal_cuda_streams;
uint32_t num_streams;
CudaRadixCiphertextFFI **tmp_block_comparisons;
int_comparison_buffer<Torus> **reduction_buffers;
int_equality_selectors_buffer(CudaStreams streams, int_radix_params params,
uint32_t num_possible_values,
uint32_t num_blocks, bool allocate_gpu_memory,
@@ -39,26 +39,15 @@ template <typename Torus> struct int_equality_selectors_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
uint32_t num_gpus = active_streams.count();
this->incoming_event = cuda_create_event(streams.gpu_index(0));
this->sub_streams = new CudaStreams[num_streams_to_use];
this->outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus];
for (uint32_t i = 0; i < num_streams_to_use; i++) {
this->sub_streams[i].create_on_same_gpus(active_streams);
for (uint32_t g = 0; g < num_gpus; g++) {
this->outgoing_events[i * num_gpus + g] =
cuda_create_event(active_streams.gpu_index(g));
}
}
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
uint32_t ciphertext_modulus = params.message_modulus * params.carry_modulus;
uint32_t box_size = params.polynomial_size / ciphertext_modulus;
this->lut_stride = (ciphertext_modulus / params.message_modulus) * box_size;
lut_stride = (ciphertext_modulus / params.message_modulus) * box_size;
this->comparison_luts = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, params.message_modulus,
@@ -100,8 +89,8 @@ template <typename Torus> struct int_equality_selectors_buffer {
size_tracker, allocate_gpu_memory);
this->reduction_buffers[j] = new int_comparison_buffer<Torus>(
streams, COMPARISON_TYPE::EQ, params, num_blocks, false,
allocate_gpu_memory, size_tracker);
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
size_tracker);
}
}
@@ -128,22 +117,7 @@ template <typename Torus> struct int_equality_selectors_buffer {
}
delete[] this->reduction_buffers;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
cuda_event_destroy(incoming_event, streams.gpu_index(0));
for (uint32_t i = 0; i < num_streams; i++) {
for (uint32_t g = 0; g < active_streams.count(); g++) {
cuda_event_destroy(outgoing_events[i * active_streams.count() + g],
active_streams.gpu_index(g));
}
}
delete[] outgoing_events;
for (uint32_t i = 0; i < num_streams; i++) {
sub_streams[i].release();
}
delete[] sub_streams;
internal_cuda_streams.release(streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
@@ -159,15 +133,13 @@ template <typename Torus> struct int_possible_results_buffer {
uint32_t lut_stride;
int_radix_lut<Torus> **stream_luts;
CudaRadixCiphertextFFI **tmp_many_luts_output;
CudaStreams active_streams;
CudaStreams *sub_streams;
cudaEvent_t incoming_event;
cudaEvent_t *outgoing_events;
InternalCudaStreams internal_cuda_streams;
uint32_t num_streams;
CudaRadixCiphertextFFI **tmp_many_luts_output;
int_possible_results_buffer(CudaStreams streams, int_radix_params params,
uint32_t num_blocks, uint32_t num_possible_values,
bool allocate_gpu_memory,
@@ -181,22 +153,11 @@ template <typename Torus> struct int_possible_results_buffer {
num_streams_to_use = 1;
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
uint32_t num_gpus = active_streams.count();
this->incoming_event = cuda_create_event(streams.gpu_index(0));
this->sub_streams = new CudaStreams[num_streams_to_use];
this->outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus];
for (uint32_t i = 0; i < num_streams_to_use; i++) {
this->sub_streams[i].create_on_same_gpus(active_streams);
for (uint32_t g = 0; g < num_gpus; g++) {
this->outgoing_events[i * num_gpus + g] =
cuda_create_event(active_streams.gpu_index(g));
}
}
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
this->max_packed_value = params.message_modulus * params.message_modulus;
uint32_t total_luts_needed = this->max_packed_value;
@@ -211,10 +172,11 @@ template <typename Torus> struct int_possible_results_buffer {
this->lut_stride =
(ciphertext_modulus / this->max_luts_per_call) * box_size;
this->num_lut_accumulators =
(total_luts_needed + max_luts_per_call - 1) / max_luts_per_call;
this->stream_luts =
stream_luts =
new int_radix_lut<Torus> *[num_streams * num_lut_accumulators];
std::vector<std::function<Torus(Torus)>> fns;
@@ -229,9 +191,9 @@ template <typename Torus> struct int_possible_results_buffer {
uint32_t luts_in_this_call =
std::min(max_luts_per_call, total_luts_needed - lut_value_start);
int_radix_lut<Torus> *current_lut = new int_radix_lut<Torus>(
sub_streams[s], params, 1, 1, luts_in_this_call,
allocate_gpu_memory, size_tracker);
int_radix_lut<Torus> *current_lut =
new int_radix_lut<Torus>(streams, params, 1, 1, luts_in_this_call,
allocate_gpu_memory, size_tracker);
for (uint32_t j = 0; j < luts_in_this_call; j++) {
uint32_t c = lut_value_start + j;
@@ -245,7 +207,7 @@ template <typename Torus> struct int_possible_results_buffer {
params.message_modulus, params.carry_modulus, fns,
allocate_gpu_memory);
current_lut->broadcast_lut(sub_streams[s].active_gpu_subset(1));
current_lut->broadcast_lut(streams.active_gpu_subset(1));
stream_luts[lut_count++] = current_lut;
lut_value_start += luts_in_this_call;
}
@@ -278,23 +240,7 @@ template <typename Torus> struct int_possible_results_buffer {
}
delete[] this->tmp_many_luts_output;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
cuda_event_destroy(incoming_event, streams.gpu_index(0));
for (uint32_t s = 0; s < this->num_streams; ++s) {
for (uint32_t g = 0; g < active_streams.count(); ++g) {
cuda_event_destroy(
this->outgoing_events[s * active_streams.count() + g],
this->sub_streams[s].gpu_index(g));
}
}
delete[] outgoing_events;
for (uint32_t i = 0; i < num_streams; i++) {
sub_streams[i].release();
}
delete[] sub_streams;
internal_cuda_streams.release(streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
@@ -309,21 +255,16 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
int_radix_lut<Torus> *message_extract_lut;
int_radix_lut<Torus> *carry_extract_lut;
CudaRadixCiphertextFFI **partial_aggregated_vectors;
CudaRadixCiphertextFFI **partial_temp_vectors;
CudaRadixCiphertextFFI *message_ct;
CudaRadixCiphertextFFI *carry_ct;
CudaStreams active_streams;
InternalCudaStreams internal_cuda_streams;
CudaStreams *sub_streams;
cudaEvent_t incoming_event;
cudaEvent_t *outgoing_events;
uint32_t num_streams;
cudaEvent_t reduction_done_event;
cudaEvent_t *message_done_events;
cudaEvent_t *carry_done_events;
CudaRadixCiphertextFFI **partial_aggregated_vectors;
CudaRadixCiphertextFFI **partial_temp_vectors;
CudaRadixCiphertextFFI *message_ct;
CudaRadixCiphertextFFI *carry_ct;
int_aggregate_one_hot_buffer(CudaStreams streams, int_radix_params params,
uint32_t num_blocks, uint32_t num_matches,
@@ -340,40 +281,18 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
num_streams_to_use = std::max((uint32_t)2, num_streams_to_use);
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
uint32_t num_gpus = active_streams.count();
this->incoming_event = cuda_create_event(streams.gpu_index(0));
this->reduction_done_event = cuda_create_event(streams.gpu_index(0));
this->message_done_events = new cudaEvent_t[num_gpus];
this->carry_done_events = new cudaEvent_t[num_gpus];
for (uint32_t g = 0; g < num_gpus; g++) {
this->message_done_events[g] =
cuda_create_event(active_streams.gpu_index(g));
this->carry_done_events[g] =
cuda_create_event(active_streams.gpu_index(g));
}
this->sub_streams = new CudaStreams[num_streams];
this->outgoing_events = new cudaEvent_t[num_streams * num_gpus];
for (uint32_t i = 0; i < num_streams; i++) {
this->sub_streams[i].create_on_same_gpus(active_streams);
for (uint32_t g = 0; g < num_gpus; g++) {
this->outgoing_events[i * num_gpus + g] =
cuda_create_event(active_streams.gpu_index(g));
}
}
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams);
this->stream_identity_luts = new int_radix_lut<Torus> *[num_streams];
std::function<Torus(Torus)> id_fn = [](Torus x) -> Torus { return x; };
for (uint32_t i = 0; i < num_streams; i++) {
int_radix_lut<Torus> *lut =
new int_radix_lut<Torus>(sub_streams[i], params, 1, num_blocks,
allocate_gpu_memory, size_tracker);
int_radix_lut<Torus> *lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
@@ -381,7 +300,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
id_fn, allocate_gpu_memory);
lut->broadcast_lut(sub_streams[i].active_gpu_subset(num_blocks));
lut->broadcast_lut(streams.active_gpu_subset(num_blocks));
this->stream_identity_luts[i] = lut;
}
@@ -392,9 +311,8 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
return x / params.message_modulus;
};
this->message_extract_lut =
new int_radix_lut<Torus>(sub_streams[0], params, 1, num_blocks,
allocate_gpu_memory, size_tracker);
this->message_extract_lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->message_extract_lut->get_lut(0, 0),
@@ -403,11 +321,10 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
msg_fn, allocate_gpu_memory);
this->message_extract_lut->broadcast_lut(
sub_streams[0].active_gpu_subset(num_blocks));
streams.active_gpu_subset(num_blocks));
this->carry_extract_lut =
new int_radix_lut<Torus>(sub_streams[1], params, 1, num_blocks,
allocate_gpu_memory, size_tracker);
this->carry_extract_lut = new int_radix_lut<Torus>(
streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
this->carry_extract_lut->get_lut(0, 0),
@@ -416,7 +333,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
carry_fn, allocate_gpu_memory);
this->carry_extract_lut->broadcast_lut(
sub_streams[1].active_gpu_subset(num_blocks));
streams.active_gpu_subset(num_blocks));
this->partial_aggregated_vectors =
new CudaRadixCiphertextFFI *[num_streams];
@@ -481,31 +398,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
this->carry_ct, this->allocate_gpu_memory);
delete this->carry_ct;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
cuda_event_destroy(incoming_event, streams.gpu_index(0));
cuda_event_destroy(reduction_done_event, streams.gpu_index(0));
for (uint g = 0; g < active_streams.count(); g++) {
cuda_event_destroy(message_done_events[g], active_streams.gpu_index(g));
cuda_event_destroy(carry_done_events[g], active_streams.gpu_index(g));
}
delete[] message_done_events;
delete[] carry_done_events;
for (uint32_t s = 0; s < this->num_streams; ++s) {
for (uint32_t g = 0; g < active_streams.count(); ++g) {
cuda_event_destroy(
this->outgoing_events[s * active_streams.count() + g],
this->sub_streams[s].gpu_index(g));
}
}
delete[] outgoing_events;
for (uint32_t i = 0; i < num_streams; i++) {
sub_streams[i].release();
}
delete[] sub_streams;
internal_cuda_streams.release(streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
@@ -718,10 +611,7 @@ template <typename Torus> struct int_unchecked_contains_buffer {
CudaRadixCiphertextFFI *packed_selectors;
CudaStreams active_streams;
CudaStreams *sub_streams;
cudaEvent_t incoming_event;
cudaEvent_t *outgoing_events;
InternalCudaStreams internal_cuda_streams;
uint32_t num_streams;
int_unchecked_contains_buffer(CudaStreams streams, int_radix_params params,
@@ -740,20 +630,8 @@ template <typename Torus> struct int_unchecked_contains_buffer {
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
uint32_t num_gpus = active_streams.count();
this->incoming_event = cuda_create_event(streams.gpu_index(0));
this->sub_streams = new CudaStreams[num_streams_to_use];
this->outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus];
for (uint32_t i = 0; i < num_streams_to_use; i++) {
this->sub_streams[i].create_on_same_gpus(active_streams);
for (uint32_t g = 0; g < num_gpus; g++) {
this->outgoing_events[i * num_gpus + g] =
cuda_create_event(active_streams.gpu_index(g));
}
}
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
for (uint32_t i = 0; i < num_streams; i++) {
@@ -788,23 +666,7 @@ template <typename Torus> struct int_unchecked_contains_buffer {
this->allocate_gpu_memory);
delete this->packed_selectors;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
cuda_event_destroy(incoming_event, streams.gpu_index(0));
for (uint32_t s = 0; s < this->num_streams; ++s) {
for (uint32_t g = 0; g < active_streams.count(); ++g) {
cuda_event_destroy(
this->outgoing_events[s * active_streams.count() + g],
this->sub_streams[s].gpu_index(g));
}
}
delete[] outgoing_events;
for (uint32_t i = 0; i < num_streams; i++) {
sub_streams[i].release();
}
delete[] sub_streams;
internal_cuda_streams.release(streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
@@ -823,10 +685,7 @@ template <typename Torus> struct int_unchecked_contains_clear_buffer {
Torus *d_clear_val;
CudaStreams active_streams;
CudaStreams *sub_streams;
cudaEvent_t incoming_event;
cudaEvent_t *outgoing_events;
InternalCudaStreams internal_cuda_streams;
uint32_t num_streams;
int_unchecked_contains_clear_buffer(CudaStreams streams,
@@ -846,20 +705,8 @@ template <typename Torus> struct int_unchecked_contains_clear_buffer {
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
uint32_t num_gpus = active_streams.count();
this->incoming_event = cuda_create_event(streams.gpu_index(0));
this->sub_streams = new CudaStreams[num_streams_to_use];
this->outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus];
for (uint32_t i = 0; i < num_streams_to_use; i++) {
this->sub_streams[i].create_on_same_gpus(active_streams);
for (uint32_t g = 0; g < num_gpus; g++) {
this->outgoing_events[i * num_gpus + g] =
cuda_create_event(active_streams.gpu_index(g));
}
}
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
for (uint32_t i = 0; i < num_streams; i++) {
@@ -911,23 +758,7 @@ template <typename Torus> struct int_unchecked_contains_clear_buffer {
cuda_drop_async(this->d_clear_val, streams.stream(0), streams.gpu_index(0));
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
cuda_event_destroy(incoming_event, streams.gpu_index(0));
for (uint32_t s = 0; s < this->num_streams; ++s) {
for (uint32_t g = 0; g < active_streams.count(); ++g) {
cuda_event_destroy(
this->outgoing_events[s * active_streams.count() + g],
this->sub_streams[s].gpu_index(g));
}
}
delete[] outgoing_events;
for (uint32_t i = 0; i < num_streams; i++) {
sub_streams[i].release();
}
delete[] sub_streams;
internal_cuda_streams.release(streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
@@ -968,6 +799,7 @@ template <typename Torus> struct int_unchecked_is_in_clears_buffer {
allocate_gpu_memory);
this->unpacked_selectors = new CudaRadixCiphertextFFI[num_clears];
for (uint32_t i = 0; i < num_clears; i++) {
as_radix_ciphertext_slice<Torus>(&this->unpacked_selectors[i],
this->packed_selectors, i, i + 1);
@@ -1245,10 +1077,7 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
int_radix_lut<Torus> *cleanup_lut;
CudaStreams active_streams;
CudaStreams *sub_streams;
cudaEvent_t incoming_event;
cudaEvent_t *outgoing_events;
InternalCudaStreams internal_cuda_streams;
uint32_t num_streams;
int_unchecked_first_index_of_clear_buffer(
@@ -1267,19 +1096,8 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
uint32_t num_gpus = active_streams.count();
this->incoming_event = cuda_create_event(streams.gpu_index(0));
this->sub_streams = new CudaStreams[num_streams_to_use];
this->outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus];
for (uint32_t i = 0; i < num_streams_to_use; i++) {
this->sub_streams[i].create_on_same_gpus(active_streams);
for (uint32_t g = 0; g < num_gpus; g++) {
this->outgoing_events[i * num_gpus + g] =
cuda_create_event(active_streams.gpu_index(g));
}
}
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
uint32_t packed_len = (num_blocks_index + 1) / 2;
@@ -1332,18 +1150,18 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
num_blocks * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
size_tracker, allocate_gpu_memory);
this->h_indices = nullptr;
h_indices = nullptr;
if (allocate_gpu_memory) {
uint32_t num_bits_in_message = log2_int(params.message_modulus);
uint32_t bits_per_packed_block = 2 * num_bits_in_message;
this->h_indices = new uint64_t[num_inputs * packed_len];
h_indices = new uint64_t[num_inputs * packed_len];
for (uint32_t i = 0; i < num_inputs; i++) {
uint64_t val = i;
for (uint32_t b = 0; b < packed_len; b++) {
uint64_t mask = (1ULL << bits_per_packed_block) - 1;
uint64_t block_val = (val >> (b * bits_per_packed_block)) & mask;
this->h_indices[i * packed_len + b] = block_val;
h_indices[i * packed_len + b] = block_val;
}
}
}
@@ -1428,23 +1246,7 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
cuda_drop_async(this->d_clear_val, streams.stream(0), streams.gpu_index(0));
cuda_event_destroy(incoming_event, streams.gpu_index(0));
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
for (uint32_t s = 0; s < this->num_streams; ++s) {
for (uint32_t g = 0; g < active_streams.count(); ++g) {
cuda_event_destroy(
this->outgoing_events[s * active_streams.count() + g],
this->sub_streams[s].gpu_index(g));
}
}
delete[] outgoing_events;
for (uint32_t i = 0; i < num_streams; i++) {
sub_streams[i].release();
}
delete[] sub_streams;
internal_cuda_streams.release(streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
@@ -1471,10 +1273,7 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
int_radix_lut<Torus> *cleanup_lut;
CudaStreams active_streams;
CudaStreams *sub_streams;
cudaEvent_t incoming_event;
cudaEvent_t *outgoing_events;
InternalCudaStreams internal_cuda_streams;
uint32_t num_streams;
int_unchecked_first_index_of_buffer(CudaStreams streams,
@@ -1495,19 +1294,8 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
uint32_t num_gpus = active_streams.count();
this->incoming_event = cuda_create_event(streams.gpu_index(0));
this->sub_streams = new CudaStreams[num_streams_to_use];
this->outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus];
for (uint32_t i = 0; i < num_streams_to_use; i++) {
this->sub_streams[i].create_on_same_gpus(active_streams);
for (uint32_t g = 0; g < num_gpus; g++) {
this->outgoing_events[i * num_gpus + g] =
cuda_create_event(active_streams.gpu_index(g));
}
}
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
uint32_t packed_len = (num_blocks_index + 1) / 2;
@@ -1550,18 +1338,18 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
}
this->h_indices = nullptr;
h_indices = nullptr;
if (allocate_gpu_memory) {
uint32_t num_bits_in_message = log2_int(params.message_modulus);
uint32_t bits_per_packed_block = 2 * num_bits_in_message;
this->h_indices = new uint64_t[num_inputs * packed_len];
h_indices = new uint64_t[num_inputs * packed_len];
for (uint32_t i = 0; i < num_inputs; i++) {
uint64_t val = i;
for (uint32_t b = 0; b < packed_len; b++) {
uint64_t mask = (1ULL << bits_per_packed_block) - 1;
uint64_t block_val = (val >> (b * bits_per_packed_block)) & mask;
this->h_indices[i * packed_len + b] = block_val;
h_indices[i * packed_len + b] = block_val;
}
}
}
@@ -1639,23 +1427,7 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
}
delete[] this->possible_results_ct_list;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
cuda_event_destroy(incoming_event, streams.gpu_index(0));
for (uint32_t s = 0; s < this->num_streams; ++s) {
for (uint32_t g = 0; g < active_streams.count(); ++g) {
cuda_event_destroy(
this->outgoing_events[s * active_streams.count() + g],
this->sub_streams[s].gpu_index(g));
}
}
delete[] outgoing_events;
for (uint32_t i = 0; i < num_streams; i++) {
sub_streams[i].release();
}
delete[] sub_streams;
internal_cuda_streams.release(streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
@@ -1672,10 +1444,7 @@ template <typename Torus> struct int_unchecked_index_of_buffer {
int_final_index_from_selectors_buffer<Torus> *final_index_buf;
CudaStreams active_streams;
CudaStreams *sub_streams;
cudaEvent_t incoming_event;
cudaEvent_t *outgoing_events;
InternalCudaStreams internal_cuda_streams;
uint32_t num_streams;
int_unchecked_index_of_buffer(CudaStreams streams, int_radix_params params,
@@ -1695,19 +1464,8 @@ template <typename Torus> struct int_unchecked_index_of_buffer {
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
uint32_t num_gpus = active_streams.count();
this->incoming_event = cuda_create_event(streams.gpu_index(0));
this->sub_streams = new CudaStreams[num_streams_to_use];
this->outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus];
for (uint32_t i = 0; i < num_streams_to_use; i++) {
this->sub_streams[i].create_on_same_gpus(active_streams);
for (uint32_t g = 0; g < num_gpus; g++) {
this->outgoing_events[i * num_gpus + g] =
cuda_create_event(active_streams.gpu_index(g));
}
}
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
for (uint32_t i = 0; i < num_streams; i++) {
@@ -1731,23 +1489,7 @@ template <typename Torus> struct int_unchecked_index_of_buffer {
this->final_index_buf->release(streams);
delete this->final_index_buf;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
cuda_event_destroy(incoming_event, streams.gpu_index(0));
for (uint32_t s = 0; s < this->num_streams; ++s) {
for (uint32_t g = 0; g < active_streams.count(); ++g) {
cuda_event_destroy(
this->outgoing_events[s * active_streams.count() + g],
this->sub_streams[s].gpu_index(g));
}
}
delete[] outgoing_events;
for (uint32_t i = 0; i < num_streams; i++) {
sub_streams[i].release();
}
delete[] sub_streams;
internal_cuda_streams.release(streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
@@ -1762,10 +1504,7 @@ template <typename Torus> struct int_unchecked_index_of_clear_buffer {
int_final_index_from_selectors_buffer<Torus> *final_index_buf;
CudaStreams active_streams;
CudaStreams *sub_streams;
cudaEvent_t incoming_event;
cudaEvent_t *outgoing_events;
InternalCudaStreams internal_cuda_streams;
uint32_t num_streams;
int_unchecked_index_of_clear_buffer(CudaStreams streams,
@@ -1786,19 +1525,8 @@ template <typename Torus> struct int_unchecked_index_of_clear_buffer {
this->num_streams = num_streams_to_use;
this->active_streams = streams.active_gpu_subset(num_blocks);
uint32_t num_gpus = active_streams.count();
this->incoming_event = cuda_create_event(streams.gpu_index(0));
this->sub_streams = new CudaStreams[num_streams_to_use];
this->outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus];
for (uint32_t i = 0; i < num_streams_to_use; i++) {
this->sub_streams[i].create_on_same_gpus(active_streams);
for (uint32_t g = 0; g < num_gpus; g++) {
this->outgoing_events[i * num_gpus + g] =
cuda_create_event(active_streams.gpu_index(g));
}
}
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
active_streams, num_streams_to_use);
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
for (uint32_t i = 0; i < num_streams; i++) {
@@ -1822,23 +1550,7 @@ template <typename Torus> struct int_unchecked_index_of_clear_buffer {
this->final_index_buf->release(streams);
delete this->final_index_buf;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
cuda_event_destroy(incoming_event, streams.gpu_index(0));
for (uint32_t s = 0; s < this->num_streams; ++s) {
for (uint32_t g = 0; g < active_streams.count(); ++g) {
cuda_event_destroy(
this->outgoing_events[s * active_streams.count() + g],
this->sub_streams[s].gpu_index(g));
}
}
delete[] outgoing_events;
for (uint32_t i = 0; i < num_streams; i++) {
sub_streams[i].release();
}
delete[] sub_streams;
internal_cuda_streams.release(streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
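Taken together, the buffer changes in this header all follow one shape; a distilled sketch of the new constructor/release contract, using the member names from the hunks above:

// Constructor: one call replaces the per-stream/per-GPU event setup.
this->active_streams = streams.active_gpu_subset(num_blocks);
this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
    active_streams, num_streams_to_use);
// LUTs are now built and broadcast against `streams` directly rather
// than against individual sub-streams.

// release(): one call replaces event destruction + sub-stream release.
internal_cuda_streams.release(streams);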

View File

@@ -2261,53 +2261,34 @@ void host_single_borrow_propagate(CudaStreams streams,
params.carry_modulus);
}
cuda_event_record(mem->incoming_events[0], streams.stream(0),
streams.gpu_index(0));
for (int j = 0; j < mem->active_streams.count(); j++) {
cuda_stream_wait_event(mem->sub_streams_1.stream(j),
mem->incoming_events[0],
mem->sub_streams_1.gpu_index(j));
cuda_stream_wait_event(mem->sub_streams_2.stream(j),
mem->incoming_events[0],
mem->sub_streams_1.gpu_index(j));
}
mem->internal_streams.internal_streams_wait_for_main_stream_0(streams);
CudaStreams sub_streams_1 = mem->internal_streams[0];
CudaStreams sub_streams_2 = mem->internal_streams[1];
if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
auto borrow_flag = mem->lut_borrow_flag;
integer_radix_apply_univariate_lookup_table<Torus>(
mem->sub_streams_1, overflow_block, mem->overflow_block, bsks, ksks,
sub_streams_1, overflow_block, mem->overflow_block, bsks, ksks,
borrow_flag, 1);
}
for (int j = 0; j < mem->active_streams.count(); j++) {
cuda_event_record(mem->outgoing_events1[j], mem->sub_streams_1.stream(j),
mem->sub_streams_1.gpu_index(j));
}
// subtract borrow and cleanup prepared blocks
auto resolved_carries = mem->prop_simu_group_carries_mem->resolved_carries;
host_negation<Torus>(
mem->sub_streams_2.stream(0), mem->sub_streams_2.gpu_index(0),
(Torus *)resolved_carries->ptr, (Torus *)resolved_carries->ptr,
big_lwe_dimension, num_groups);
host_negation<Torus>(sub_streams_2.stream(0), sub_streams_2.gpu_index(0),
(Torus *)resolved_carries->ptr,
(Torus *)resolved_carries->ptr, big_lwe_dimension,
num_groups);
host_radix_sum_in_groups<Torus>(
mem->sub_streams_2.stream(0), mem->sub_streams_2.gpu_index(0),
prepared_blocks, prepared_blocks, resolved_carries, num_radix_blocks,
mem->group_size);
sub_streams_2.stream(0), sub_streams_2.gpu_index(0), prepared_blocks,
prepared_blocks, resolved_carries, num_radix_blocks, mem->group_size);
auto message_extract = mem->lut_message_extract;
integer_radix_apply_univariate_lookup_table<Torus>(
mem->sub_streams_2, lwe_array, prepared_blocks, bsks, ksks,
message_extract, num_radix_blocks);
sub_streams_2, lwe_array, prepared_blocks, bsks, ksks, message_extract,
num_radix_blocks);
for (int j = 0; j < mem->active_streams.count(); j++) {
cuda_event_record(mem->outgoing_events2[j], mem->sub_streams_2.stream(j),
mem->sub_streams_2.gpu_index(j));
cuda_stream_wait_event(streams.stream(0), mem->outgoing_events1[j],
streams.gpu_index(0));
cuda_stream_wait_event(streams.stream(0), mem->outgoing_events2[j],
streams.gpu_index(0));
}
mem->internal_streams.main_stream_0_wait_for_internal_streams(streams);
}
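Distilled, the rewritten function body above is the plain fork/join pattern (sketch; the kernel calls are elided):

mem->internal_streams.internal_streams_wait_for_main_stream_0(streams); // fork
CudaStreams sub_streams_1 = mem->internal_streams[0]; // overflow-flag LUT
CudaStreams sub_streams_2 = mem->internal_streams[1]; // borrow + cleanup
// ... independent kernels enqueued on each sub-stream ...
mem->internal_streams.main_stream_0_wait_for_internal_streams(streams); // join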
/// num_radix_blocks corresponds to the number of blocks on which to apply the

View File

@@ -24,25 +24,16 @@ __host__ void host_compute_equality_selectors(
(Torus *const *)ksks, mem_ptr->comparison_luts, message_modulus,
mem_ptr->lut_stride);
cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
streams.gpu_index(0));
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
mem_ptr->incoming_event,
mem_ptr->sub_streams[j].gpu_index(i));
}
}
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
streams);
uint32_t num_streams = mem_ptr->num_streams;
uint32_t num_gpus = mem_ptr->active_streams.count();
for (uint32_t i = 0; i < num_possible_values; i++) {
uint32_t stream_idx = i % num_streams;
CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
CudaStreams current_stream = mem_ptr->internal_cuda_streams[stream_idx];
CudaRadixCiphertextFFI *current_tmp_block_comparisons =
mem_ptr->tmp_block_comparisons[stream_idx];
@@ -75,16 +66,8 @@ __host__ void host_compute_equality_selectors(
current_reduction_buffer, bsks, (Torus **)ksks, num_blocks);
}
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
mem_ptr->sub_streams[j].stream(i),
mem_ptr->sub_streams[j].gpu_index(i));
cuda_stream_wait_event(streams.stream(0),
mem_ptr->outgoing_events[j * num_gpus + i],
streams.gpu_index(0));
}
}
mem_ptr->internal_cuda_streams.main_stream_0_wait_for_internal_streams(
streams);
}
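The same dispatch recurs in every host function of this file: fork once, assign input i to internal stream i % num_streams, join once. A distilled sketch, where process is a hypothetical stand-in for the per-input LUT and comparison calls:

mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(streams);
for (uint32_t i = 0; i < num_inputs; i++) {
  // Input i always lands on the same stream, so the per-stream scratch
  // buffers (tmp_block_comparisons[stream_idx], ...) are never shared.
  uint32_t stream_idx = i % mem_ptr->num_streams;
  process(mem_ptr->internal_cuda_streams[stream_idx], inputs[i]);
}
mem_ptr->internal_cuda_streams.main_stream_0_wait_for_internal_streams(streams);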
template <typename Torus>
@@ -113,23 +96,14 @@ __host__ void host_create_possible_results(
uint32_t max_luts_per_call = mem_ptr->max_luts_per_call;
uint32_t num_lut_accumulators = mem_ptr->num_lut_accumulators;
uint32_t num_streams = mem_ptr->num_streams;
uint32_t num_gpus = mem_ptr->active_streams.count();
cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
streams.gpu_index(0));
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
mem_ptr->incoming_event,
mem_ptr->sub_streams[j].gpu_index(i));
}
}
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
streams);
for (uint32_t i = 0; i < num_possible_values; i++) {
uint32_t stream_idx = i % num_streams;
CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
CudaStreams current_stream = mem_ptr->internal_cuda_streams[stream_idx];
CudaRadixCiphertextFFI *current_tmp_buffer =
mem_ptr->tmp_many_luts_output[stream_idx];
@@ -174,16 +148,8 @@ __host__ void host_create_possible_results(
}
}
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
mem_ptr->sub_streams[j].stream(i),
mem_ptr->sub_streams[j].gpu_index(i));
cuda_stream_wait_event(streams.stream(0),
mem_ptr->outgoing_events[j * num_gpus + i],
streams.gpu_index(0));
}
}
mem_ptr->internal_cuda_streams.main_stream_0_wait_for_internal_streams(
streams);
}
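host_aggregate_one_hot_vector below needs only two of its internal streams for the message/carry extraction step, which is what the slice variants are for; a sketch of that pattern (extract_message and extract_carry are hypothetical stand-ins):

uint32_t stream_indexes[] = {0, 1};
mem_ptr->internal_cuda_streams.internal_streams_slice_wait_for_main_stream_0(
    streams, stream_indexes, 2);
extract_message(mem_ptr->internal_cuda_streams[0]);
extract_carry(mem_ptr->internal_cuda_streams[1]);
mem_ptr->internal_cuda_streams.main_stream_0_wait_for_internal_streams_slice(
    streams, stream_indexes, 2);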
template <typename Torus>
@@ -211,25 +177,16 @@ __host__ void host_aggregate_one_hot_vector(
int_radix_params params = mem_ptr->params;
uint32_t chunk_size = mem_ptr->chunk_size;
uint32_t num_streams = mem_ptr->num_streams;
uint32_t num_gpus = mem_ptr->active_streams.count();
cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
streams.gpu_index(0));
for (uint32_t s = 0; s < num_streams; s++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_stream_wait_event(mem_ptr->sub_streams[s].stream(i),
mem_ptr->incoming_event,
mem_ptr->sub_streams[s].gpu_index(i));
}
}
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
streams);
uint32_t inputs_per_stream =
(num_input_ciphertexts + num_streams - 1) / num_streams;
for (uint32_t s = 0; s < num_streams; s++) {
CudaStreams current_stream = mem_ptr->sub_streams[s];
CudaStreams current_stream = mem_ptr->internal_cuda_streams[s];
CudaRadixCiphertextFFI *current_agg =
mem_ptr->partial_aggregated_vectors[s];
@@ -291,16 +248,8 @@ __host__ void host_aggregate_one_hot_vector(
}
}
for (uint32_t s = 0; s < num_streams; s++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_event_record(mem_ptr->outgoing_events[s * num_gpus + i],
mem_ptr->sub_streams[s].stream(i),
mem_ptr->sub_streams[s].gpu_index(i));
cuda_stream_wait_event(streams.stream(0),
mem_ptr->outgoing_events[s * num_gpus + i],
streams.gpu_index(0));
}
}
mem_ptr->internal_cuda_streams.main_stream_0_wait_for_internal_streams(
streams);
CudaRadixCiphertextFFI *final_agg = mem_ptr->partial_aggregated_vectors[0];
@@ -329,20 +278,14 @@ __host__ void host_aggregate_one_hot_vector(
streams.stream(0), streams.gpu_index(0), temp_agg, 0, num_blocks,
final_agg, 0, num_blocks);
CudaStreams message_stream = mem_ptr->sub_streams[0];
CudaStreams carry_stream = mem_ptr->sub_streams[1];
CudaStreams message_stream = mem_ptr->internal_cuda_streams[0];
CudaStreams carry_stream = mem_ptr->internal_cuda_streams[1];
cuda_event_record(mem_ptr->reduction_done_event, streams.stream(0),
streams.gpu_index(0));
uint32_t stream_indexes[] = {0, 1};
size_t num_stream_indexes = 2;
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_stream_wait_event(message_stream.stream(i),
mem_ptr->reduction_done_event,
message_stream.gpu_index(i));
cuda_stream_wait_event(carry_stream.stream(i),
mem_ptr->reduction_done_event,
carry_stream.gpu_index(i));
}
mem_ptr->internal_cuda_streams.internal_streams_slice_wait_for_main_stream_0(
streams, stream_indexes, num_stream_indexes);
//
// Extract message part on a first substream
@@ -358,17 +301,8 @@ __host__ void host_aggregate_one_hot_vector(
carry_stream, carry_ct, temp_agg, bsks, ksks, mem_ptr->carry_extract_lut,
num_blocks);
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_event_record(mem_ptr->message_done_events[i], message_stream.stream(i),
message_stream.gpu_index(i));
cuda_event_record(mem_ptr->carry_done_events[i], carry_stream.stream(i),
carry_stream.gpu_index(i));
cuda_stream_wait_event(streams.stream(0), mem_ptr->message_done_events[i],
streams.gpu_index(0));
cuda_stream_wait_event(streams.stream(0), mem_ptr->carry_done_events[i],
streams.gpu_index(0));
}
mem_ptr->internal_cuda_streams.main_stream_0_wait_for_internal_streams_slice(
streams, stream_indexes, num_stream_indexes);
//
// Pack the message and carry parts into the output LWE array
@@ -530,23 +464,14 @@ host_unchecked_contains(CudaStreams streams, CudaRadixCiphertextFFI *output,
int_unchecked_contains_buffer<Torus> *mem_ptr,
void *const *bsks, Torus *const *ksks) {
cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
streams.gpu_index(0));
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
mem_ptr->incoming_event,
mem_ptr->sub_streams[j].gpu_index(i));
}
}
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
streams);
uint32_t num_streams = mem_ptr->num_streams;
uint32_t num_gpus = mem_ptr->active_streams.count();
for (uint32_t i = 0; i < num_inputs; i++) {
uint32_t stream_idx = i % num_streams;
CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
CudaStreams current_stream = mem_ptr->internal_cuda_streams[stream_idx];
CudaRadixCiphertextFFI const *input_ct = &inputs[i];
@@ -559,16 +484,8 @@ host_unchecked_contains(CudaStreams streams, CudaRadixCiphertextFFI *output,
bsks, ksks, num_blocks);
}
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
mem_ptr->sub_streams[j].stream(i),
mem_ptr->sub_streams[j].gpu_index(i));
cuda_stream_wait_event(streams.stream(0),
mem_ptr->outgoing_events[j * num_gpus + i],
streams.gpu_index(0));
}
}
mem_ptr->internal_cuda_streams.main_stream_0_wait_for_internal_streams(
streams);
host_integer_is_at_least_one_comparisons_block_true<Torus>(
streams, output, mem_ptr->packed_selectors, mem_ptr->reduction_buffer,
@@ -606,23 +523,14 @@ __host__ void host_unchecked_contains_clear(
mem_ptr->d_clear_val, (Torus *)h_clear_val, num_blocks,
mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus);
cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
streams.gpu_index(0));
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
mem_ptr->incoming_event,
mem_ptr->sub_streams[j].gpu_index(i));
}
}
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
streams);
uint32_t num_streams = mem_ptr->num_streams;
uint32_t num_gpus = mem_ptr->active_streams.count();
for (uint32_t i = 0; i < num_inputs; i++) {
uint32_t stream_idx = i % num_streams;
CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
CudaStreams current_stream = mem_ptr->internal_cuda_streams[stream_idx];
CudaRadixCiphertextFFI const *input_ct = &inputs[i];
@@ -636,16 +544,8 @@ __host__ void host_unchecked_contains_clear(
num_blocks);
}
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
mem_ptr->sub_streams[j].stream(i),
mem_ptr->sub_streams[j].gpu_index(i));
cuda_stream_wait_event(streams.stream(0),
mem_ptr->outgoing_events[j * num_gpus + i],
streams.gpu_index(0));
}
}
mem_ptr->internal_cuda_streams.main_stream_0_wait_for_internal_streams(
streams);
host_integer_is_at_least_one_comparisons_block_true<Torus>(
streams, output, mem_ptr->packed_selectors, mem_ptr->reduction_buffer,
@@ -853,23 +753,14 @@ __host__ void host_unchecked_first_index_of_clear(
mem_ptr->d_clear_val, (Torus *)h_clear_val, num_blocks,
mem_ptr->params.message_modulus, mem_ptr->params.carry_modulus);
cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
streams.gpu_index(0));
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
mem_ptr->incoming_event,
mem_ptr->sub_streams[j].gpu_index(i));
}
}
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
streams);
uint32_t num_streams = mem_ptr->num_streams;
uint32_t num_gpus = mem_ptr->active_streams.count();
for (uint32_t i = 0; i < num_inputs; i++) {
uint32_t stream_idx = i % num_streams;
CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
CudaStreams current_stream = mem_ptr->internal_cuda_streams[stream_idx];
CudaRadixCiphertextFFI const *input_ct = &inputs[i];
@@ -883,16 +774,8 @@ __host__ void host_unchecked_first_index_of_clear(
num_blocks);
}
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
mem_ptr->sub_streams[j].stream(i),
mem_ptr->sub_streams[j].gpu_index(i));
cuda_stream_wait_event(streams.stream(0),
mem_ptr->outgoing_events[j * num_gpus + i],
streams.gpu_index(0));
}
}
mem_ptr->internal_cuda_streams.main_stream_0_wait_for_internal_streams(
streams);
for (uint32_t offset = 1; offset < num_inputs; offset <<= 1) {
uint32_t count = num_inputs - offset;
@@ -953,23 +836,14 @@ __host__ void host_unchecked_first_index_of(
int_unchecked_first_index_of_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
streams.gpu_index(0));
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
mem_ptr->incoming_event,
mem_ptr->sub_streams[j].gpu_index(i));
}
}
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
streams);
uint32_t num_streams = mem_ptr->num_streams;
uint32_t num_gpus = mem_ptr->active_streams.count();
for (uint32_t i = 0; i < num_inputs; i++) {
uint32_t stream_idx = i % num_streams;
CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
CudaStreams current_stream = mem_ptr->internal_cuda_streams[stream_idx];
CudaRadixCiphertextFFI const *input_ct = &inputs[i];
@@ -982,16 +856,8 @@ __host__ void host_unchecked_first_index_of(
bsks, ksks, num_blocks);
}
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
mem_ptr->sub_streams[j].stream(i),
mem_ptr->sub_streams[j].gpu_index(i));
cuda_stream_wait_event(streams.stream(0),
mem_ptr->outgoing_events[j * num_gpus + i],
streams.gpu_index(0));
}
}
mem_ptr->internal_cuda_streams.main_stream_0_wait_for_internal_streams(
streams);
for (uint32_t offset = 1; offset < num_inputs; offset <<= 1) {
uint32_t count = num_inputs - offset;
@@ -1052,23 +918,14 @@ __host__ void host_unchecked_index_of(
int_unchecked_index_of_buffer<Torus> *mem_ptr, void *const *bsks,
Torus *const *ksks) {
cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
streams.gpu_index(0));
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
mem_ptr->incoming_event,
mem_ptr->sub_streams[j].gpu_index(i));
}
}
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
streams);
uint32_t num_streams = mem_ptr->num_streams;
uint32_t num_gpus = mem_ptr->active_streams.count();
for (uint32_t i = 0; i < num_inputs; i++) {
uint32_t stream_idx = i % num_streams;
CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
CudaStreams current_stream = mem_ptr->internal_cuda_streams[stream_idx];
CudaRadixCiphertextFFI const *input_ct = &inputs[i];
@@ -1082,16 +939,8 @@ __host__ void host_unchecked_index_of(
bsks, ksks, num_blocks);
}
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
mem_ptr->sub_streams[j].stream(i),
mem_ptr->sub_streams[j].gpu_index(i));
cuda_stream_wait_event(streams.stream(0),
mem_ptr->outgoing_events[j * num_gpus + i],
streams.gpu_index(0));
}
}
mem_ptr->internal_cuda_streams.main_stream_0_wait_for_internal_streams(
streams);
uint32_t packed_len = (num_blocks_index + 1) / 2;
@@ -1144,23 +993,14 @@ __host__ void host_unchecked_index_of_clear(
streams.stream(0), streams.gpu_index(0), packed_selectors, 0,
num_inputs);
} else {
cuda_event_record(mem_ptr->incoming_event, streams.stream(0),
streams.gpu_index(0));
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_stream_wait_event(mem_ptr->sub_streams[j].stream(i),
mem_ptr->incoming_event,
mem_ptr->sub_streams[j].gpu_index(i));
}
}
mem_ptr->internal_cuda_streams.internal_streams_wait_for_main_stream_0(
streams);
uint32_t num_streams = mem_ptr->num_streams;
uint32_t num_gpus = mem_ptr->active_streams.count();
for (uint32_t i = 0; i < num_inputs; i++) {
uint32_t stream_idx = i % num_streams;
CudaStreams current_stream = mem_ptr->sub_streams[stream_idx];
CudaStreams current_stream = mem_ptr->internal_cuda_streams[stream_idx];
CudaRadixCiphertextFFI const *input_ct = &inputs[i];
@@ -1174,16 +1014,8 @@ __host__ void host_unchecked_index_of_clear(
num_scalar_blocks);
}
for (uint32_t j = 0; j < mem_ptr->num_streams; j++) {
for (uint32_t i = 0; i < mem_ptr->active_streams.count(); i++) {
cuda_event_record(mem_ptr->outgoing_events[j * num_gpus + i],
mem_ptr->sub_streams[j].stream(i),
mem_ptr->sub_streams[j].gpu_index(i));
cuda_stream_wait_event(streams.stream(0),
mem_ptr->outgoing_events[j * num_gpus + i],
streams.gpu_index(0));
}
}
mem_ptr->internal_cuda_streams.main_stream_0_wait_for_internal_streams(
streams);
}
uint32_t packed_len = (num_blocks_index + 1) / 2;