mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-09 14:47:56 -05:00
1818 lines
66 KiB
C++
1818 lines
66 KiB
C++
#pragma once
|
|
#include "cast.h"
|
|
#include "integer/comparison.h"
|
|
#include "integer/radix_ciphertext.cuh"
|
|
#include "integer_utilities.h"
|
|
#include <functional>
|
|
#include <vector>
|
|
|
|
// Upper bound on the number of sub-stream sets created by each vector-find
// buffer in this file; constructors clamp their stream count to this value.
constexpr uint32_t MAX_STREAMS_FOR_VECTOR_FIND = 10;
|
|
|
|
template <typename Torus> struct int_equality_selectors_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
uint32_t lut_stride;
|
|
|
|
uint32_t num_possible_values;
|
|
int_radix_lut<Torus> *comparison_luts;
|
|
CudaRadixCiphertextFFI *tmp_many_luts_output;
|
|
|
|
CudaStreams active_streams;
|
|
CudaStreams *sub_streams;
|
|
cudaEvent_t incoming_event;
|
|
cudaEvent_t *outgoing_events;
|
|
uint32_t num_streams;
|
|
|
|
CudaRadixCiphertextFFI **tmp_block_comparisons;
|
|
int_comparison_buffer<Torus> **reduction_buffers;
|
|
|
|
int_equality_selectors_buffer(CudaStreams streams, int_radix_params params,
|
|
uint32_t num_possible_values,
|
|
uint32_t num_blocks, bool allocate_gpu_memory,
|
|
uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->num_possible_values = num_possible_values;
|
|
|
|
uint32_t num_streams_to_use =
|
|
std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_possible_values);
|
|
if (num_streams_to_use == 0)
|
|
num_streams_to_use = 1;
|
|
|
|
this->num_streams = num_streams_to_use;
|
|
|
|
this->active_streams = streams.active_gpu_subset(num_blocks);
|
|
uint32_t num_gpus = active_streams.count();
|
|
|
|
incoming_event = cuda_create_event(streams.gpu_index(0));
|
|
|
|
sub_streams = new CudaStreams[num_streams_to_use];
|
|
outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus];
|
|
|
|
for (uint32_t i = 0; i < num_streams_to_use; i++) {
|
|
sub_streams[i].create_on_same_gpus(active_streams);
|
|
for (uint32_t j = 0; j < num_gpus; j++) {
|
|
outgoing_events[i * num_gpus + j] =
|
|
cuda_create_event(active_streams.gpu_index(j));
|
|
}
|
|
}
|
|
|
|
uint32_t ciphertext_modulus = params.message_modulus * params.carry_modulus;
|
|
uint32_t box_size = params.polynomial_size / ciphertext_modulus;
|
|
lut_stride = (ciphertext_modulus / params.message_modulus) * box_size;
|
|
|
|
this->comparison_luts = new int_radix_lut<Torus>(
|
|
streams, params, 1, num_blocks, params.message_modulus,
|
|
allocate_gpu_memory, size_tracker);
|
|
|
|
std::vector<std::function<Torus(Torus)>> fns;
|
|
fns.reserve(params.message_modulus);
|
|
for (uint32_t i = 0; i < params.message_modulus; i++) {
|
|
fns.push_back([i](Torus x) -> Torus { return (x == i); });
|
|
}
|
|
|
|
generate_many_lut_device_accumulator<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
this->comparison_luts->get_lut(0, 0),
|
|
this->comparison_luts->get_degree(0),
|
|
this->comparison_luts->get_max_degree(0), params.glwe_dimension,
|
|
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
|
fns, allocate_gpu_memory);
|
|
|
|
fns.clear();
|
|
|
|
this->comparison_luts->broadcast_lut(active_streams);
|
|
|
|
this->tmp_many_luts_output = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->tmp_many_luts_output,
|
|
params.message_modulus * num_blocks, params.big_lwe_dimension,
|
|
size_tracker, allocate_gpu_memory);
|
|
|
|
this->tmp_block_comparisons =
|
|
new CudaRadixCiphertextFFI *[this->num_streams];
|
|
this->reduction_buffers =
|
|
new int_comparison_buffer<Torus> *[this->num_streams];
|
|
for (uint32_t j = 0; j < this->num_streams; j++) {
|
|
this->tmp_block_comparisons[j] = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
this->tmp_block_comparisons[j], num_blocks, params.big_lwe_dimension,
|
|
size_tracker, allocate_gpu_memory);
|
|
|
|
this->reduction_buffers[j] = new int_comparison_buffer<Torus>(
|
|
streams, COMPARISON_TYPE::EQ, params, num_blocks, false,
|
|
allocate_gpu_memory, size_tracker);
|
|
}
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
this->comparison_luts->release(streams);
|
|
delete this->comparison_luts;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->tmp_many_luts_output,
|
|
this->allocate_gpu_memory);
|
|
delete this->tmp_many_luts_output;
|
|
|
|
for (uint32_t i = 0; i < this->num_streams; i++) {
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->tmp_block_comparisons[i],
|
|
this->allocate_gpu_memory);
|
|
delete this->tmp_block_comparisons[i];
|
|
}
|
|
delete[] this->tmp_block_comparisons;
|
|
|
|
for (uint32_t i = 0; i < this->num_streams; i++) {
|
|
this->reduction_buffers[i]->release(streams);
|
|
delete this->reduction_buffers[i];
|
|
}
|
|
delete[] this->reduction_buffers;
|
|
|
|
cuda_event_destroy(incoming_event, streams.gpu_index(0));
|
|
|
|
uint32_t num_gpus = active_streams.count();
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
for (uint32_t j = 0; j < num_gpus; j++) {
|
|
cuda_event_destroy(outgoing_events[i * num_gpus + j],
|
|
active_streams.gpu_index(j));
|
|
}
|
|
}
|
|
delete[] outgoing_events;
|
|
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
sub_streams[i].release();
|
|
}
|
|
delete[] sub_streams;
|
|
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
}
|
|
};
|
|
|
|
template <typename Torus> struct int_possible_results_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
|
|
uint32_t max_packed_value;
|
|
uint32_t max_luts_per_call;
|
|
uint32_t num_lut_accumulators;
|
|
uint32_t lut_stride;
|
|
|
|
int_radix_lut<Torus> **stream_luts;
|
|
|
|
CudaStreams active_streams;
|
|
CudaStreams *sub_streams;
|
|
cudaEvent_t incoming_event;
|
|
cudaEvent_t *outgoing_events;
|
|
uint32_t num_streams;
|
|
|
|
CudaRadixCiphertextFFI **tmp_many_luts_output;
|
|
|
|
int_possible_results_buffer(CudaStreams streams, int_radix_params params,
|
|
uint32_t num_blocks, uint32_t num_possible_values,
|
|
bool allocate_gpu_memory,
|
|
uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
|
|
uint32_t num_streams_to_use =
|
|
std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_possible_values);
|
|
if (num_streams_to_use == 0)
|
|
num_streams_to_use = 1;
|
|
|
|
this->num_streams = num_streams_to_use;
|
|
|
|
this->active_streams = streams.active_gpu_subset(num_blocks);
|
|
uint32_t num_gpus = active_streams.count();
|
|
|
|
incoming_event = cuda_create_event(streams.gpu_index(0));
|
|
|
|
sub_streams = new CudaStreams[num_streams_to_use];
|
|
outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus];
|
|
|
|
for (uint32_t i = 0; i < num_streams_to_use; i++) {
|
|
sub_streams[i].create_on_same_gpus(active_streams);
|
|
for (uint32_t j = 0; j < num_gpus; j++) {
|
|
outgoing_events[i * num_gpus + j] =
|
|
cuda_create_event(active_streams.gpu_index(j));
|
|
}
|
|
}
|
|
|
|
this->max_packed_value = params.message_modulus * params.message_modulus;
|
|
uint32_t total_luts_needed = this->max_packed_value;
|
|
|
|
uint32_t ciphertext_modulus = params.message_modulus * params.carry_modulus;
|
|
uint32_t box_size = params.polynomial_size / ciphertext_modulus;
|
|
|
|
this->max_luts_per_call = (ciphertext_modulus) / 2;
|
|
if (this->max_luts_per_call == 0) {
|
|
this->max_luts_per_call = 1;
|
|
}
|
|
|
|
this->lut_stride =
|
|
(ciphertext_modulus / this->max_luts_per_call) * box_size;
|
|
|
|
this->num_lut_accumulators =
|
|
(total_luts_needed + max_luts_per_call - 1) / max_luts_per_call;
|
|
|
|
stream_luts =
|
|
new int_radix_lut<Torus> *[num_streams * num_lut_accumulators];
|
|
|
|
std::vector<std::function<Torus(Torus)>> fns;
|
|
fns.reserve(max_luts_per_call);
|
|
|
|
uint32_t lut_count = 0;
|
|
for (uint32_t s = 0; s < num_streams; s++) {
|
|
uint32_t lut_value_start = 0;
|
|
|
|
for (uint32_t i = 0; i < num_lut_accumulators; i++) {
|
|
fns.clear();
|
|
uint32_t luts_in_this_call =
|
|
std::min(max_luts_per_call, total_luts_needed - lut_value_start);
|
|
|
|
int_radix_lut<Torus> *current_lut = new int_radix_lut<Torus>(
|
|
sub_streams[s], params, 1, 1, luts_in_this_call,
|
|
allocate_gpu_memory, size_tracker);
|
|
|
|
for (uint32_t j = 0; j < luts_in_this_call; j++) {
|
|
uint32_t c = lut_value_start + j;
|
|
fns.push_back([c](Torus x) -> Torus { return (x == 1) * c; });
|
|
}
|
|
|
|
generate_many_lut_device_accumulator<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), current_lut->get_lut(0, 0),
|
|
current_lut->get_degree(0), current_lut->get_max_degree(0),
|
|
params.glwe_dimension, params.polynomial_size,
|
|
params.message_modulus, params.carry_modulus, fns,
|
|
allocate_gpu_memory);
|
|
|
|
current_lut->broadcast_lut(sub_streams[s].active_gpu_subset(1));
|
|
stream_luts[lut_count++] = current_lut;
|
|
lut_value_start += luts_in_this_call;
|
|
}
|
|
}
|
|
fns.clear();
|
|
|
|
this->tmp_many_luts_output =
|
|
new CudaRadixCiphertextFFI *[this->num_streams];
|
|
for (uint32_t j = 0; j < this->num_streams; j++) {
|
|
this->tmp_many_luts_output[j] = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
this->tmp_many_luts_output[j], max_luts_per_call,
|
|
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
|
}
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
for (uint32_t i = 0; i < num_streams * num_lut_accumulators; i++) {
|
|
stream_luts[i]->release(streams);
|
|
delete stream_luts[i];
|
|
}
|
|
delete[] stream_luts;
|
|
|
|
for (uint32_t i = 0; i < this->num_streams; i++) {
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->tmp_many_luts_output[i],
|
|
this->allocate_gpu_memory);
|
|
delete this->tmp_many_luts_output[i];
|
|
}
|
|
delete[] this->tmp_many_luts_output;
|
|
|
|
cuda_event_destroy(incoming_event, streams.gpu_index(0));
|
|
|
|
uint32_t num_gpus = active_streams.count();
|
|
for (uint j = 0; j < num_streams; j++) {
|
|
for (uint k = 0; k < num_gpus; k++) {
|
|
cuda_event_destroy(outgoing_events[j * num_gpus + k],
|
|
active_streams.gpu_index(k));
|
|
}
|
|
}
|
|
delete[] outgoing_events;
|
|
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
sub_streams[i].release();
|
|
}
|
|
delete[] sub_streams;
|
|
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
}
|
|
};
|
|
|
|
template <typename Torus> struct int_aggregate_one_hot_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
uint32_t chunk_size;
|
|
|
|
int_radix_lut<Torus> **stream_identity_luts;
|
|
int_radix_lut<Torus> *message_extract_lut;
|
|
int_radix_lut<Torus> *carry_extract_lut;
|
|
|
|
CudaStreams active_streams;
|
|
CudaStreams *sub_streams;
|
|
cudaEvent_t incoming_event;
|
|
cudaEvent_t *outgoing_events;
|
|
|
|
cudaEvent_t reduction_done_event;
|
|
cudaEvent_t *message_done_events;
|
|
cudaEvent_t *carry_done_events;
|
|
|
|
uint32_t num_streams;
|
|
|
|
CudaRadixCiphertextFFI **partial_aggregated_vectors;
|
|
CudaRadixCiphertextFFI **partial_temp_vectors;
|
|
|
|
CudaRadixCiphertextFFI *message_ct;
|
|
CudaRadixCiphertextFFI *carry_ct;
|
|
|
|
int_aggregate_one_hot_buffer(CudaStreams streams, int_radix_params params,
|
|
uint32_t num_blocks, uint32_t num_matches,
|
|
bool allocate_gpu_memory,
|
|
uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
|
|
uint32_t total_modulus = params.message_modulus * params.carry_modulus;
|
|
this->chunk_size = (total_modulus - 1) / (params.message_modulus - 1);
|
|
|
|
uint32_t num_streams_to_use =
|
|
std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_matches);
|
|
num_streams_to_use = std::max((uint32_t)2, num_streams_to_use);
|
|
|
|
this->num_streams = num_streams_to_use;
|
|
|
|
this->active_streams = streams.active_gpu_subset(num_blocks);
|
|
uint32_t num_gpus = active_streams.count();
|
|
|
|
this->incoming_event = cuda_create_event(streams.gpu_index(0));
|
|
this->reduction_done_event = cuda_create_event(streams.gpu_index(0));
|
|
|
|
this->message_done_events = new cudaEvent_t[num_gpus];
|
|
this->carry_done_events = new cudaEvent_t[num_gpus];
|
|
for (uint32_t i = 0; i < num_gpus; i++) {
|
|
this->message_done_events[i] =
|
|
cuda_create_event(active_streams.gpu_index(i));
|
|
this->carry_done_events[i] =
|
|
cuda_create_event(active_streams.gpu_index(i));
|
|
}
|
|
|
|
this->sub_streams = new CudaStreams[num_streams];
|
|
this->outgoing_events = new cudaEvent_t[num_streams * num_gpus];
|
|
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
this->sub_streams[i].create_on_same_gpus(active_streams);
|
|
for (uint32_t j = 0; j < num_gpus; j++) {
|
|
this->outgoing_events[i * num_gpus + j] =
|
|
cuda_create_event(active_streams.gpu_index(j));
|
|
}
|
|
}
|
|
|
|
this->stream_identity_luts = new int_radix_lut<Torus> *[num_streams];
|
|
std::function<Torus(Torus)> id_fn = [](Torus x) -> Torus { return x; };
|
|
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
int_radix_lut<Torus> *lut =
|
|
new int_radix_lut<Torus>(sub_streams[i], params, 1, num_blocks,
|
|
allocate_gpu_memory, size_tracker);
|
|
|
|
generate_device_accumulator<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
|
|
lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
|
|
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
|
id_fn, allocate_gpu_memory);
|
|
|
|
lut->broadcast_lut(sub_streams[i].active_gpu_subset(num_blocks));
|
|
this->stream_identity_luts[i] = lut;
|
|
}
|
|
|
|
std::function<Torus(Torus)> msg_fn = [params](Torus x) -> Torus {
|
|
return (x % params.message_modulus) % params.message_modulus;
|
|
};
|
|
std::function<Torus(Torus)> carry_fn = [params](Torus x) -> Torus {
|
|
return x / params.message_modulus;
|
|
};
|
|
|
|
this->message_extract_lut =
|
|
new int_radix_lut<Torus>(sub_streams[0], params, 1, num_blocks,
|
|
allocate_gpu_memory, size_tracker);
|
|
generate_device_accumulator<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
this->message_extract_lut->get_lut(0, 0),
|
|
this->message_extract_lut->get_degree(0),
|
|
this->message_extract_lut->get_max_degree(0), params.glwe_dimension,
|
|
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
|
msg_fn, allocate_gpu_memory);
|
|
this->message_extract_lut->broadcast_lut(
|
|
sub_streams[0].active_gpu_subset(num_blocks));
|
|
|
|
this->carry_extract_lut =
|
|
new int_radix_lut<Torus>(sub_streams[1], params, 1, num_blocks,
|
|
allocate_gpu_memory, size_tracker);
|
|
generate_device_accumulator<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
this->carry_extract_lut->get_lut(0, 0),
|
|
this->carry_extract_lut->get_degree(0),
|
|
this->carry_extract_lut->get_max_degree(0), params.glwe_dimension,
|
|
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
|
carry_fn, allocate_gpu_memory);
|
|
this->carry_extract_lut->broadcast_lut(
|
|
sub_streams[1].active_gpu_subset(num_blocks));
|
|
|
|
this->partial_aggregated_vectors =
|
|
new CudaRadixCiphertextFFI *[num_streams];
|
|
this->partial_temp_vectors = new CudaRadixCiphertextFFI *[num_streams];
|
|
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
this->partial_aggregated_vectors[i] = new CudaRadixCiphertextFFI;
|
|
this->partial_temp_vectors[i] = new CudaRadixCiphertextFFI;
|
|
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
this->partial_aggregated_vectors[i], num_blocks,
|
|
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
|
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
this->partial_temp_vectors[i], num_blocks, params.big_lwe_dimension,
|
|
size_tracker, allocate_gpu_memory);
|
|
}
|
|
|
|
this->message_ct = new CudaRadixCiphertextFFI;
|
|
this->carry_ct = new CudaRadixCiphertextFFI;
|
|
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->message_ct, num_blocks,
|
|
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->carry_ct, num_blocks,
|
|
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
stream_identity_luts[i]->release(streams);
|
|
delete stream_identity_luts[i];
|
|
}
|
|
delete[] stream_identity_luts;
|
|
|
|
this->message_extract_lut->release(streams);
|
|
delete this->message_extract_lut;
|
|
this->carry_extract_lut->release(streams);
|
|
delete this->carry_extract_lut;
|
|
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->partial_aggregated_vectors[i],
|
|
this->allocate_gpu_memory);
|
|
delete this->partial_aggregated_vectors[i];
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->partial_temp_vectors[i],
|
|
this->allocate_gpu_memory);
|
|
delete this->partial_temp_vectors[i];
|
|
}
|
|
delete[] partial_aggregated_vectors;
|
|
delete[] partial_temp_vectors;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->message_ct, this->allocate_gpu_memory);
|
|
delete this->message_ct;
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->carry_ct, this->allocate_gpu_memory);
|
|
delete this->carry_ct;
|
|
|
|
cuda_event_destroy(incoming_event, streams.gpu_index(0));
|
|
cuda_event_destroy(reduction_done_event, streams.gpu_index(0));
|
|
uint32_t num_gpus = active_streams.count();
|
|
for (uint i = 0; i < num_gpus; i++) {
|
|
cuda_event_destroy(message_done_events[i], active_streams.gpu_index(i));
|
|
cuda_event_destroy(carry_done_events[i], active_streams.gpu_index(i));
|
|
}
|
|
delete[] message_done_events;
|
|
delete[] carry_done_events;
|
|
|
|
for (uint j = 0; j < num_streams; j++) {
|
|
for (uint k = 0; k < num_gpus; k++) {
|
|
cuda_event_destroy(outgoing_events[j * num_gpus + k],
|
|
active_streams.gpu_index(k));
|
|
}
|
|
}
|
|
delete[] outgoing_events;
|
|
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
sub_streams[i].release();
|
|
}
|
|
delete[] sub_streams;
|
|
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
}
|
|
};
|
|
|
|
template <typename Torus> struct int_unchecked_match_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
uint32_t num_matches;
|
|
uint32_t num_input_blocks;
|
|
uint32_t num_output_packed_blocks;
|
|
bool max_output_is_zero;
|
|
|
|
int_equality_selectors_buffer<Torus> *eq_selectors_buffer;
|
|
int_possible_results_buffer<Torus> *possible_results_buffer;
|
|
int_aggregate_one_hot_buffer<Torus> *aggregate_buffer;
|
|
int_comparison_buffer<Torus> *at_least_one_true_buffer;
|
|
|
|
CudaRadixCiphertextFFI *selectors_list;
|
|
CudaRadixCiphertextFFI *packed_selectors_ct;
|
|
CudaRadixCiphertextFFI *possible_results_list;
|
|
|
|
int_unchecked_match_buffer(CudaStreams streams, int_radix_params params,
|
|
uint32_t num_matches, uint32_t num_input_blocks,
|
|
uint32_t num_output_packed_blocks,
|
|
bool max_output_is_zero, bool allocate_gpu_memory,
|
|
uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->num_matches = num_matches;
|
|
this->num_input_blocks = num_input_blocks;
|
|
this->num_output_packed_blocks = num_output_packed_blocks;
|
|
this->max_output_is_zero = max_output_is_zero;
|
|
|
|
this->eq_selectors_buffer = new int_equality_selectors_buffer<Torus>(
|
|
streams, params, num_matches, num_input_blocks, allocate_gpu_memory,
|
|
size_tracker);
|
|
|
|
this->possible_results_buffer = new int_possible_results_buffer<Torus>(
|
|
streams, params, num_output_packed_blocks, num_matches,
|
|
allocate_gpu_memory, size_tracker);
|
|
|
|
if (!max_output_is_zero) {
|
|
this->aggregate_buffer = new int_aggregate_one_hot_buffer<Torus>(
|
|
streams, params, num_output_packed_blocks, num_matches,
|
|
allocate_gpu_memory, size_tracker);
|
|
}
|
|
|
|
this->at_least_one_true_buffer = new int_comparison_buffer<Torus>(
|
|
streams, EQ, params, num_matches, false, allocate_gpu_memory,
|
|
size_tracker);
|
|
|
|
this->selectors_list = new CudaRadixCiphertextFFI[num_matches];
|
|
this->possible_results_list = new CudaRadixCiphertextFFI[num_matches];
|
|
|
|
for (uint32_t i = 0; i < num_matches; i++) {
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), &this->selectors_list[i], 1,
|
|
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
|
if (!max_output_is_zero) {
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
&this->possible_results_list[i], num_output_packed_blocks,
|
|
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
|
}
|
|
}
|
|
|
|
this->packed_selectors_ct = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->packed_selectors_ct,
|
|
num_matches, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
this->eq_selectors_buffer->release(streams);
|
|
delete this->eq_selectors_buffer;
|
|
|
|
this->possible_results_buffer->release(streams);
|
|
delete this->possible_results_buffer;
|
|
|
|
if (!max_output_is_zero) {
|
|
this->aggregate_buffer->release(streams);
|
|
delete this->aggregate_buffer;
|
|
}
|
|
|
|
this->at_least_one_true_buffer->release(streams);
|
|
delete this->at_least_one_true_buffer;
|
|
|
|
for (uint32_t i = 0; i < num_matches; i++) {
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
&this->selectors_list[i],
|
|
this->allocate_gpu_memory);
|
|
if (!max_output_is_zero) {
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
&this->possible_results_list[i],
|
|
this->allocate_gpu_memory);
|
|
}
|
|
}
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->packed_selectors_ct,
|
|
this->allocate_gpu_memory);
|
|
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
|
|
delete[] this->selectors_list;
|
|
delete[] this->possible_results_list;
|
|
delete this->packed_selectors_ct;
|
|
}
|
|
};
|
|
|
|
template <typename Torus> struct int_unchecked_match_value_or_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
|
|
uint32_t num_matches;
|
|
uint32_t num_input_blocks;
|
|
uint32_t num_match_packed_blocks;
|
|
uint32_t num_final_blocks;
|
|
bool max_output_is_zero;
|
|
|
|
int_unchecked_match_buffer<Torus> *match_buffer;
|
|
int_cmux_buffer<Torus> *cmux_buffer;
|
|
|
|
CudaRadixCiphertextFFI *tmp_match_result;
|
|
CudaRadixCiphertextFFI *tmp_match_bool;
|
|
CudaRadixCiphertextFFI *tmp_or_value;
|
|
|
|
Torus *d_or_value;
|
|
|
|
int_unchecked_match_value_or_buffer(
|
|
CudaStreams streams, int_radix_params params, uint32_t num_matches,
|
|
uint32_t num_input_blocks, uint32_t num_match_packed_blocks,
|
|
uint32_t num_final_blocks, bool max_output_is_zero,
|
|
bool allocate_gpu_memory, uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->num_matches = num_matches;
|
|
this->num_input_blocks = num_input_blocks;
|
|
this->num_match_packed_blocks = num_match_packed_blocks;
|
|
this->num_final_blocks = num_final_blocks;
|
|
this->max_output_is_zero = max_output_is_zero;
|
|
|
|
this->match_buffer = new int_unchecked_match_buffer<Torus>(
|
|
streams, params, num_matches, num_input_blocks, num_match_packed_blocks,
|
|
max_output_is_zero, allocate_gpu_memory, size_tracker);
|
|
|
|
this->cmux_buffer = new int_cmux_buffer<Torus>(
|
|
streams, [](Torus x) -> Torus { return x == 1; }, params,
|
|
num_final_blocks, allocate_gpu_memory, size_tracker);
|
|
|
|
this->tmp_match_result = new CudaRadixCiphertextFFI;
|
|
this->tmp_match_bool = new CudaRadixCiphertextFFI;
|
|
this->tmp_or_value = new CudaRadixCiphertextFFI;
|
|
|
|
this->d_or_value = (Torus *)cuda_malloc_with_size_tracking_async(
|
|
num_final_blocks * sizeof(Torus), streams.stream(0),
|
|
streams.gpu_index(0), size_tracker, allocate_gpu_memory);
|
|
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->tmp_match_result,
|
|
num_final_blocks, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->tmp_match_bool, 1,
|
|
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->tmp_or_value,
|
|
num_final_blocks, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
this->match_buffer->release(streams);
|
|
delete this->match_buffer;
|
|
|
|
this->cmux_buffer->release(streams);
|
|
delete this->cmux_buffer;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->tmp_match_result,
|
|
this->allocate_gpu_memory);
|
|
delete this->tmp_match_result;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->tmp_match_bool,
|
|
this->allocate_gpu_memory);
|
|
delete this->tmp_match_bool;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->tmp_or_value,
|
|
this->allocate_gpu_memory);
|
|
delete this->tmp_or_value;
|
|
|
|
cuda_drop_async(this->d_or_value, streams.stream(0), streams.gpu_index(0));
|
|
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
}
|
|
};
|
|
|
|
template <typename Torus> struct int_unchecked_contains_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
uint32_t num_inputs;
|
|
|
|
int_comparison_buffer<Torus> **eq_buffers;
|
|
int_comparison_buffer<Torus> *reduction_buffer;
|
|
|
|
CudaRadixCiphertextFFI *packed_selectors;
|
|
|
|
CudaStreams active_streams;
|
|
CudaStreams *sub_streams;
|
|
cudaEvent_t incoming_event;
|
|
cudaEvent_t *outgoing_events;
|
|
uint32_t num_streams;
|
|
|
|
int_unchecked_contains_buffer(CudaStreams streams, int_radix_params params,
|
|
uint32_t num_inputs, uint32_t num_blocks,
|
|
bool allocate_gpu_memory,
|
|
uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->num_inputs = num_inputs;
|
|
|
|
uint32_t num_streams_to_use =
|
|
std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs);
|
|
if (num_streams_to_use == 0)
|
|
num_streams_to_use = 1;
|
|
|
|
this->num_streams = num_streams_to_use;
|
|
this->active_streams = streams.active_gpu_subset(num_blocks);
|
|
uint32_t num_gpus = active_streams.count();
|
|
|
|
incoming_event = cuda_create_event(streams.gpu_index(0));
|
|
sub_streams = new CudaStreams[num_streams_to_use];
|
|
outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus];
|
|
|
|
for (uint32_t i = 0; i < num_streams_to_use; i++) {
|
|
sub_streams[i].create_on_same_gpus(active_streams);
|
|
for (uint32_t j = 0; j < num_gpus; j++) {
|
|
outgoing_events[i * num_gpus + j] =
|
|
cuda_create_event(active_streams.gpu_index(j));
|
|
}
|
|
}
|
|
|
|
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
this->eq_buffers[i] = new int_comparison_buffer<Torus>(
|
|
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
|
|
size_tracker);
|
|
}
|
|
|
|
this->reduction_buffer =
|
|
new int_comparison_buffer<Torus>(streams, EQ, params, num_inputs, false,
|
|
allocate_gpu_memory, size_tracker);
|
|
|
|
this->packed_selectors = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->packed_selectors,
|
|
num_inputs, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
eq_buffers[i]->release(streams);
|
|
delete eq_buffers[i];
|
|
}
|
|
delete[] eq_buffers;
|
|
|
|
this->reduction_buffer->release(streams);
|
|
delete this->reduction_buffer;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->packed_selectors,
|
|
this->allocate_gpu_memory);
|
|
delete this->packed_selectors;
|
|
|
|
cuda_event_destroy(incoming_event, streams.gpu_index(0));
|
|
|
|
uint32_t num_gpus = active_streams.count();
|
|
for (uint j = 0; j < num_streams; j++) {
|
|
for (uint k = 0; k < num_gpus; k++) {
|
|
cuda_event_destroy(outgoing_events[j * num_gpus + k],
|
|
active_streams.gpu_index(k));
|
|
}
|
|
}
|
|
delete[] outgoing_events;
|
|
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
sub_streams[i].release();
|
|
}
|
|
delete[] sub_streams;
|
|
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
}
|
|
};
|
|
|
|
template <typename Torus> struct int_unchecked_contains_clear_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
uint32_t num_inputs;
|
|
|
|
int_comparison_buffer<Torus> **eq_buffers;
|
|
int_comparison_buffer<Torus> *reduction_buffer;
|
|
|
|
CudaRadixCiphertextFFI *packed_selectors;
|
|
CudaRadixCiphertextFFI *tmp_clear_val;
|
|
Torus *d_clear_val;
|
|
|
|
CudaStreams active_streams;
|
|
CudaStreams *sub_streams;
|
|
cudaEvent_t incoming_event;
|
|
cudaEvent_t *outgoing_events;
|
|
uint32_t num_streams;
|
|
|
|
int_unchecked_contains_clear_buffer(CudaStreams streams,
|
|
int_radix_params params,
|
|
uint32_t num_inputs, uint32_t num_blocks,
|
|
bool allocate_gpu_memory,
|
|
uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->num_inputs = num_inputs;
|
|
|
|
uint32_t num_streams_to_use =
|
|
std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs);
|
|
if (num_streams_to_use == 0)
|
|
num_streams_to_use = 1;
|
|
|
|
this->num_streams = num_streams_to_use;
|
|
this->active_streams = streams.active_gpu_subset(num_blocks);
|
|
uint32_t num_gpus = active_streams.count();
|
|
|
|
incoming_event = cuda_create_event(streams.gpu_index(0));
|
|
sub_streams = new CudaStreams[num_streams_to_use];
|
|
outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus];
|
|
|
|
for (uint32_t i = 0; i < num_streams_to_use; i++) {
|
|
sub_streams[i].create_on_same_gpus(active_streams);
|
|
for (uint32_t j = 0; j < num_gpus; j++) {
|
|
outgoing_events[i * num_gpus + j] =
|
|
cuda_create_event(active_streams.gpu_index(j));
|
|
}
|
|
}
|
|
|
|
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
this->eq_buffers[i] = new int_comparison_buffer<Torus>(
|
|
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
|
|
size_tracker);
|
|
}
|
|
|
|
this->reduction_buffer =
|
|
new int_comparison_buffer<Torus>(streams, EQ, params, num_inputs, false,
|
|
allocate_gpu_memory, size_tracker);
|
|
|
|
this->packed_selectors = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->packed_selectors,
|
|
num_inputs, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
|
|
this->tmp_clear_val = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->tmp_clear_val,
|
|
num_blocks, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
|
|
this->d_clear_val = (Torus *)cuda_malloc_with_size_tracking_async(
|
|
num_blocks * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
|
|
size_tracker, allocate_gpu_memory);
|
|
}
|
|
|
|
// Tears down everything this buffer holds: the num_streams equality buffers,
// the reduction buffer, the two radix ciphertexts, the device clear-value
// array, all events, and the sub-streams. Ends by synchronizing stream 0.
void release(CudaStreams streams) {
  // Stream / GPU pair used for every asynchronous release issued below.
  auto main_stream = streams.stream(0);
  auto main_gpu = streams.gpu_index(0);

  for (uint32_t s = 0; s < num_streams; ++s) {
    auto *eq = eq_buffers[s];
    eq->release(streams);
    delete eq;
  }
  delete[] eq_buffers;

  reduction_buffer->release(streams);
  delete reduction_buffer;

  release_radix_ciphertext_async(main_stream, main_gpu, packed_selectors,
                                 allocate_gpu_memory);
  delete packed_selectors;

  release_radix_ciphertext_async(main_stream, main_gpu, tmp_clear_val,
                                 allocate_gpu_memory);
  delete tmp_clear_val;

  cuda_drop_async(d_clear_val, main_stream, main_gpu);

  cuda_event_destroy(incoming_event, main_gpu);

  // outgoing_events is a flat [sub-stream][gpu] table; walking it linearly
  // visits the same entries, in the same order, as a nested loop would.
  const uint32_t gpu_count = active_streams.count();
  const uint32_t event_count = num_streams * gpu_count;
  for (uint32_t e = 0; e < event_count; ++e) {
    cuda_event_destroy(outgoing_events[e],
                       active_streams.gpu_index(e % gpu_count));
  }
  delete[] outgoing_events;

  for (uint32_t s = 0; s < num_streams; ++s)
    sub_streams[s].release();
  delete[] sub_streams;

  // Wait on stream 0, the stream the asynchronous releases were queued on.
  cuda_synchronize_stream(main_stream, main_gpu);
}
|
|
};
|
|
|
|
template <typename Torus> struct int_unchecked_is_in_clears_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
uint32_t num_clears;
|
|
|
|
int_equality_selectors_buffer<Torus> *eq_buffer;
|
|
int_comparison_buffer<Torus> *reduction_buffer;
|
|
|
|
CudaRadixCiphertextFFI *packed_selectors;
|
|
CudaRadixCiphertextFFI *unpacked_selectors;
|
|
|
|
int_unchecked_is_in_clears_buffer(CudaStreams streams,
|
|
int_radix_params params,
|
|
uint32_t num_clears, uint32_t num_blocks,
|
|
bool allocate_gpu_memory,
|
|
uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->num_clears = num_clears;
|
|
|
|
this->eq_buffer = new int_equality_selectors_buffer<Torus>(
|
|
streams, params, num_clears, num_blocks, allocate_gpu_memory,
|
|
size_tracker);
|
|
|
|
this->reduction_buffer =
|
|
new int_comparison_buffer<Torus>(streams, EQ, params, num_clears, false,
|
|
allocate_gpu_memory, size_tracker);
|
|
|
|
this->packed_selectors = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->packed_selectors,
|
|
num_clears, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
|
|
this->unpacked_selectors = new CudaRadixCiphertextFFI[num_clears];
|
|
|
|
for (uint32_t i = 0; i < num_clears; i++) {
|
|
as_radix_ciphertext_slice<Torus>(&this->unpacked_selectors[i],
|
|
this->packed_selectors, i, i + 1);
|
|
}
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
this->eq_buffer->release(streams);
|
|
delete this->eq_buffer;
|
|
|
|
this->reduction_buffer->release(streams);
|
|
delete this->reduction_buffer;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->packed_selectors,
|
|
this->allocate_gpu_memory);
|
|
delete this->packed_selectors;
|
|
|
|
delete[] this->unpacked_selectors;
|
|
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
}
|
|
};
|
|
|
|
template <typename Torus> struct int_final_index_from_selectors_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
uint32_t num_inputs;
|
|
|
|
int_possible_results_buffer<Torus> *possible_results_buf;
|
|
int_aggregate_one_hot_buffer<Torus> *aggregate_buf;
|
|
int_comparison_buffer<Torus> *reduction_buf;
|
|
|
|
CudaRadixCiphertextFFI *packed_selectors;
|
|
CudaRadixCiphertextFFI *unpacked_selectors;
|
|
CudaRadixCiphertextFFI *possible_results_ct_list;
|
|
|
|
uint64_t *h_indices;
|
|
|
|
int_final_index_from_selectors_buffer(CudaStreams streams,
|
|
int_radix_params params,
|
|
uint32_t num_inputs,
|
|
uint32_t num_blocks_index,
|
|
bool allocate_gpu_memory,
|
|
uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->num_inputs = num_inputs;
|
|
|
|
uint32_t packed_len = (num_blocks_index + 1) / 2;
|
|
|
|
this->possible_results_buf = new int_possible_results_buffer<Torus>(
|
|
streams, params, packed_len, num_inputs, allocate_gpu_memory,
|
|
size_tracker);
|
|
|
|
this->aggregate_buf = new int_aggregate_one_hot_buffer<Torus>(
|
|
streams, params, packed_len, num_inputs, allocate_gpu_memory,
|
|
size_tracker);
|
|
|
|
this->reduction_buf =
|
|
new int_comparison_buffer<Torus>(streams, EQ, params, num_inputs, false,
|
|
allocate_gpu_memory, size_tracker);
|
|
|
|
this->packed_selectors = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->packed_selectors,
|
|
num_inputs, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
|
|
this->unpacked_selectors = new CudaRadixCiphertextFFI[num_inputs];
|
|
for (uint32_t i = 0; i < num_inputs; i++) {
|
|
as_radix_ciphertext_slice<Torus>(&this->unpacked_selectors[i],
|
|
this->packed_selectors, i, i + 1);
|
|
}
|
|
|
|
this->possible_results_ct_list = new CudaRadixCiphertextFFI[num_inputs];
|
|
for (uint32_t i = 0; i < num_inputs; i++) {
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
&this->possible_results_ct_list[i], packed_len,
|
|
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
|
}
|
|
|
|
uint32_t num_bits_in_message = log2_int(params.message_modulus);
|
|
uint32_t bits_per_packed_block = 2 * num_bits_in_message;
|
|
|
|
h_indices = new uint64_t[num_inputs * packed_len];
|
|
for (uint32_t i = 0; i < num_inputs; i++) {
|
|
uint64_t val = i;
|
|
for (uint32_t b = 0; b < packed_len; b++) {
|
|
uint64_t mask = (1ULL << bits_per_packed_block) - 1;
|
|
uint64_t block_val = (val >> (b * bits_per_packed_block)) & mask;
|
|
h_indices[i * packed_len + b] = block_val;
|
|
}
|
|
}
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
this->possible_results_buf->release(streams);
|
|
delete this->possible_results_buf;
|
|
|
|
this->aggregate_buf->release(streams);
|
|
delete this->aggregate_buf;
|
|
|
|
this->reduction_buf->release(streams);
|
|
delete this->reduction_buf;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->packed_selectors,
|
|
this->allocate_gpu_memory);
|
|
delete this->packed_selectors;
|
|
|
|
delete[] this->unpacked_selectors;
|
|
|
|
for (uint32_t i = 0; i < num_inputs; i++) {
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
&this->possible_results_ct_list[i],
|
|
this->allocate_gpu_memory);
|
|
}
|
|
delete[] this->possible_results_ct_list;
|
|
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
|
|
delete[] h_indices;
|
|
}
|
|
};
|
|
|
|
template <typename Torus> struct int_unchecked_index_in_clears_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
uint32_t num_clears;
|
|
|
|
int_equality_selectors_buffer<Torus> *eq_selectors_buf;
|
|
int_final_index_from_selectors_buffer<Torus> *final_index_buf;
|
|
|
|
int_unchecked_index_in_clears_buffer(CudaStreams streams,
|
|
int_radix_params params,
|
|
uint32_t num_clears, uint32_t num_blocks,
|
|
uint32_t num_blocks_index,
|
|
bool allocate_gpu_memory,
|
|
uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->num_clears = num_clears;
|
|
|
|
this->eq_selectors_buf = new int_equality_selectors_buffer<Torus>(
|
|
streams, params, num_clears, num_blocks, allocate_gpu_memory,
|
|
size_tracker);
|
|
|
|
this->final_index_buf = new int_final_index_from_selectors_buffer<Torus>(
|
|
streams, params, num_clears, num_blocks_index, allocate_gpu_memory,
|
|
size_tracker);
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
this->eq_selectors_buf->release(streams);
|
|
delete this->eq_selectors_buf;
|
|
|
|
this->final_index_buf->release(streams);
|
|
delete this->final_index_buf;
|
|
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
}
|
|
};
|
|
|
|
template <typename Torus> struct int_unchecked_first_index_in_clears_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
uint32_t num_unique;
|
|
|
|
int_equality_selectors_buffer<Torus> *eq_selectors_buf;
|
|
int_possible_results_buffer<Torus> *possible_results_buf;
|
|
int_aggregate_one_hot_buffer<Torus> *aggregate_buf;
|
|
int_comparison_buffer<Torus> *reduction_buf;
|
|
|
|
CudaRadixCiphertextFFI *packed_selectors;
|
|
CudaRadixCiphertextFFI *unpacked_selectors;
|
|
CudaRadixCiphertextFFI *possible_results_ct_list;
|
|
|
|
int_unchecked_first_index_in_clears_buffer(
|
|
CudaStreams streams, int_radix_params params, uint32_t num_unique,
|
|
uint32_t num_blocks, uint32_t num_blocks_index, bool allocate_gpu_memory,
|
|
uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->num_unique = num_unique;
|
|
|
|
this->eq_selectors_buf = new int_equality_selectors_buffer<Torus>(
|
|
streams, params, num_unique, num_blocks, allocate_gpu_memory,
|
|
size_tracker);
|
|
|
|
uint32_t packed_len = (num_blocks_index + 1) / 2;
|
|
this->possible_results_buf = new int_possible_results_buffer<Torus>(
|
|
streams, params, packed_len, num_unique, allocate_gpu_memory,
|
|
size_tracker);
|
|
|
|
this->aggregate_buf = new int_aggregate_one_hot_buffer<Torus>(
|
|
streams, params, packed_len, num_unique, allocate_gpu_memory,
|
|
size_tracker);
|
|
|
|
this->reduction_buf =
|
|
new int_comparison_buffer<Torus>(streams, EQ, params, num_unique, false,
|
|
allocate_gpu_memory, size_tracker);
|
|
|
|
this->packed_selectors = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->packed_selectors,
|
|
num_unique, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
|
|
this->unpacked_selectors = new CudaRadixCiphertextFFI[num_unique];
|
|
for (uint32_t i = 0; i < num_unique; i++) {
|
|
as_radix_ciphertext_slice<Torus>(&this->unpacked_selectors[i],
|
|
this->packed_selectors, i, i + 1);
|
|
}
|
|
|
|
this->possible_results_ct_list = new CudaRadixCiphertextFFI[num_unique];
|
|
for (uint32_t i = 0; i < num_unique; i++) {
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
&this->possible_results_ct_list[i], packed_len,
|
|
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
|
}
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
this->eq_selectors_buf->release(streams);
|
|
delete this->eq_selectors_buf;
|
|
|
|
this->possible_results_buf->release(streams);
|
|
delete this->possible_results_buf;
|
|
|
|
this->aggregate_buf->release(streams);
|
|
delete this->aggregate_buf;
|
|
|
|
this->reduction_buf->release(streams);
|
|
delete this->reduction_buf;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->packed_selectors,
|
|
this->allocate_gpu_memory);
|
|
delete this->packed_selectors;
|
|
|
|
delete[] this->unpacked_selectors;
|
|
|
|
for (uint32_t i = 0; i < num_unique; i++) {
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
&this->possible_results_ct_list[i],
|
|
this->allocate_gpu_memory);
|
|
}
|
|
delete[] this->possible_results_ct_list;
|
|
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
}
|
|
};
|
|
|
|
template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
uint32_t num_inputs;
|
|
|
|
int_comparison_buffer<Torus> **eq_buffers;
|
|
int_possible_results_buffer<Torus> *possible_results_buf;
|
|
int_aggregate_one_hot_buffer<Torus> *aggregate_buf;
|
|
int_comparison_buffer<Torus> *reduction_buf;
|
|
|
|
CudaRadixCiphertextFFI *packed_selectors;
|
|
CudaRadixCiphertextFFI *unpacked_selectors;
|
|
CudaRadixCiphertextFFI *possible_results_ct_list;
|
|
CudaRadixCiphertextFFI *tmp_clear_val;
|
|
Torus *d_clear_val;
|
|
uint64_t *h_indices;
|
|
|
|
int_radix_lut<Torus> *prefix_sum_lut;
|
|
int_radix_lut<Torus> *cleanup_lut;
|
|
|
|
CudaStreams active_streams;
|
|
CudaStreams *sub_streams;
|
|
cudaEvent_t incoming_event;
|
|
cudaEvent_t *outgoing_events;
|
|
uint32_t num_streams;
|
|
|
|
int_unchecked_first_index_of_clear_buffer(
|
|
CudaStreams streams, int_radix_params params, uint32_t num_inputs,
|
|
uint32_t num_blocks, uint32_t num_blocks_index, bool allocate_gpu_memory,
|
|
uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->num_inputs = num_inputs;
|
|
|
|
uint32_t num_streams_to_use =
|
|
std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs);
|
|
if (num_streams_to_use == 0)
|
|
num_streams_to_use = 1;
|
|
|
|
this->num_streams = num_streams_to_use;
|
|
this->active_streams = streams.active_gpu_subset(num_blocks);
|
|
|
|
incoming_event = cuda_create_event(streams.gpu_index(0));
|
|
sub_streams = new CudaStreams[num_streams_to_use];
|
|
outgoing_events =
|
|
new cudaEvent_t[num_streams_to_use * active_streams.count()];
|
|
|
|
for (uint32_t i = 0; i < num_streams_to_use; i++) {
|
|
sub_streams[i].create_on_same_gpus(active_streams);
|
|
for (uint32_t j = 0; j < active_streams.count(); j++) {
|
|
outgoing_events[i * active_streams.count() + j] =
|
|
cuda_create_event(active_streams.gpu_index(j));
|
|
}
|
|
}
|
|
|
|
uint32_t packed_len = (num_blocks_index + 1) / 2;
|
|
|
|
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
this->eq_buffers[i] = new int_comparison_buffer<Torus>(
|
|
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
|
|
size_tracker);
|
|
}
|
|
|
|
this->possible_results_buf = new int_possible_results_buffer<Torus>(
|
|
streams, params, packed_len, num_inputs, allocate_gpu_memory,
|
|
size_tracker);
|
|
|
|
this->aggregate_buf = new int_aggregate_one_hot_buffer<Torus>(
|
|
streams, params, packed_len, num_inputs, allocate_gpu_memory,
|
|
size_tracker);
|
|
|
|
this->reduction_buf =
|
|
new int_comparison_buffer<Torus>(streams, EQ, params, num_inputs, false,
|
|
allocate_gpu_memory, size_tracker);
|
|
|
|
this->packed_selectors = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->packed_selectors,
|
|
num_inputs, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
|
|
this->unpacked_selectors = new CudaRadixCiphertextFFI[num_inputs];
|
|
for (uint32_t i = 0; i < num_inputs; i++) {
|
|
as_radix_ciphertext_slice<Torus>(&this->unpacked_selectors[i],
|
|
this->packed_selectors, i, i + 1);
|
|
}
|
|
|
|
this->possible_results_ct_list = new CudaRadixCiphertextFFI[num_inputs];
|
|
for (uint32_t i = 0; i < num_inputs; i++) {
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
&this->possible_results_ct_list[i], packed_len,
|
|
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
|
}
|
|
|
|
this->tmp_clear_val = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->tmp_clear_val,
|
|
num_blocks, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
|
|
this->d_clear_val = (Torus *)cuda_malloc_with_size_tracking_async(
|
|
num_blocks * sizeof(Torus), streams.stream(0), streams.gpu_index(0),
|
|
size_tracker, allocate_gpu_memory);
|
|
|
|
h_indices = nullptr;
|
|
if (allocate_gpu_memory) {
|
|
uint32_t num_bits_in_message = log2_int(params.message_modulus);
|
|
uint32_t bits_per_packed_block = 2 * num_bits_in_message;
|
|
|
|
h_indices = new uint64_t[num_inputs * packed_len];
|
|
for (uint32_t i = 0; i < num_inputs; i++) {
|
|
uint64_t val = i;
|
|
for (uint32_t b = 0; b < packed_len; b++) {
|
|
uint64_t mask = (1ULL << bits_per_packed_block) - 1;
|
|
uint64_t block_val = (val >> (b * bits_per_packed_block)) & mask;
|
|
h_indices[i * packed_len + b] = block_val;
|
|
}
|
|
}
|
|
}
|
|
|
|
const Torus ALREADY_SEEN = 2;
|
|
auto prefix_sum_fn = [ALREADY_SEEN](Torus current,
|
|
Torus previous) -> Torus {
|
|
if (previous == 1 || previous == ALREADY_SEEN) {
|
|
return ALREADY_SEEN;
|
|
}
|
|
return current;
|
|
};
|
|
this->prefix_sum_lut = new int_radix_lut<Torus>(
|
|
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
|
|
|
|
generate_device_accumulator_bivariate<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
this->prefix_sum_lut->get_lut(0, 0),
|
|
this->prefix_sum_lut->get_degree(0),
|
|
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
|
|
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
|
prefix_sum_fn, allocate_gpu_memory);
|
|
this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
|
|
|
|
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
|
|
Torus val = x % params.message_modulus;
|
|
if (val == ALREADY_SEEN)
|
|
return 0;
|
|
return val;
|
|
};
|
|
this->cleanup_lut = new int_radix_lut<Torus>(
|
|
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
|
|
generate_device_accumulator<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
|
|
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
|
|
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
|
cleanup_fn, allocate_gpu_memory);
|
|
this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
eq_buffers[i]->release(streams);
|
|
delete eq_buffers[i];
|
|
}
|
|
delete[] eq_buffers;
|
|
|
|
this->possible_results_buf->release(streams);
|
|
delete this->possible_results_buf;
|
|
|
|
this->aggregate_buf->release(streams);
|
|
delete this->aggregate_buf;
|
|
|
|
this->reduction_buf->release(streams);
|
|
delete this->reduction_buf;
|
|
|
|
this->prefix_sum_lut->release(streams);
|
|
delete this->prefix_sum_lut;
|
|
|
|
this->cleanup_lut->release(streams);
|
|
delete this->cleanup_lut;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->packed_selectors,
|
|
this->allocate_gpu_memory);
|
|
delete this->packed_selectors;
|
|
|
|
delete[] this->unpacked_selectors;
|
|
|
|
for (uint32_t i = 0; i < num_inputs; i++) {
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
&this->possible_results_ct_list[i],
|
|
this->allocate_gpu_memory);
|
|
}
|
|
delete[] this->possible_results_ct_list;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->tmp_clear_val,
|
|
this->allocate_gpu_memory);
|
|
delete this->tmp_clear_val;
|
|
|
|
cuda_drop_async(this->d_clear_val, streams.stream(0), streams.gpu_index(0));
|
|
|
|
cuda_event_destroy(incoming_event, streams.gpu_index(0));
|
|
|
|
uint32_t num_gpus = active_streams.count();
|
|
for (uint j = 0; j < num_streams; j++) {
|
|
for (uint k = 0; k < num_gpus; k++) {
|
|
cuda_event_destroy(outgoing_events[j * num_gpus + k],
|
|
active_streams.gpu_index(k));
|
|
}
|
|
}
|
|
delete[] outgoing_events;
|
|
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
sub_streams[i].release();
|
|
}
|
|
delete[] sub_streams;
|
|
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
|
|
delete[] h_indices;
|
|
}
|
|
};
|
|
|
|
template <typename Torus> struct int_unchecked_first_index_of_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
uint32_t num_inputs;
|
|
|
|
int_comparison_buffer<Torus> **eq_buffers;
|
|
int_possible_results_buffer<Torus> *possible_results_buf;
|
|
int_aggregate_one_hot_buffer<Torus> *aggregate_buf;
|
|
int_comparison_buffer<Torus> *reduction_buf;
|
|
|
|
CudaRadixCiphertextFFI *packed_selectors;
|
|
CudaRadixCiphertextFFI *unpacked_selectors;
|
|
CudaRadixCiphertextFFI *possible_results_ct_list;
|
|
uint64_t *h_indices;
|
|
|
|
int_radix_lut<Torus> *prefix_sum_lut;
|
|
int_radix_lut<Torus> *cleanup_lut;
|
|
|
|
CudaStreams active_streams;
|
|
CudaStreams *sub_streams;
|
|
cudaEvent_t incoming_event;
|
|
cudaEvent_t *outgoing_events;
|
|
uint32_t num_streams;
|
|
|
|
int_unchecked_first_index_of_buffer(CudaStreams streams,
|
|
int_radix_params params,
|
|
uint32_t num_inputs, uint32_t num_blocks,
|
|
uint32_t num_blocks_index,
|
|
bool allocate_gpu_memory,
|
|
uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->num_inputs = num_inputs;
|
|
|
|
uint32_t num_streams_to_use =
|
|
std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs);
|
|
if (num_streams_to_use == 0)
|
|
num_streams_to_use = 1;
|
|
|
|
this->num_streams = num_streams_to_use;
|
|
this->active_streams = streams.active_gpu_subset(num_blocks);
|
|
|
|
incoming_event = cuda_create_event(streams.gpu_index(0));
|
|
sub_streams = new CudaStreams[num_streams_to_use];
|
|
outgoing_events =
|
|
new cudaEvent_t[num_streams_to_use * active_streams.count()];
|
|
|
|
for (uint32_t i = 0; i < num_streams_to_use; i++) {
|
|
sub_streams[i].create_on_same_gpus(active_streams);
|
|
for (uint32_t j = 0; j < active_streams.count(); j++) {
|
|
outgoing_events[i * active_streams.count() + j] =
|
|
cuda_create_event(active_streams.gpu_index(j));
|
|
}
|
|
}
|
|
|
|
uint32_t packed_len = (num_blocks_index + 1) / 2;
|
|
|
|
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
this->eq_buffers[i] = new int_comparison_buffer<Torus>(
|
|
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
|
|
size_tracker);
|
|
}
|
|
|
|
this->possible_results_buf = new int_possible_results_buffer<Torus>(
|
|
streams, params, packed_len, num_inputs, allocate_gpu_memory,
|
|
size_tracker);
|
|
|
|
this->aggregate_buf = new int_aggregate_one_hot_buffer<Torus>(
|
|
streams, params, packed_len, num_inputs, allocate_gpu_memory,
|
|
size_tracker);
|
|
|
|
this->reduction_buf =
|
|
new int_comparison_buffer<Torus>(streams, EQ, params, num_inputs, false,
|
|
allocate_gpu_memory, size_tracker);
|
|
|
|
this->packed_selectors = new CudaRadixCiphertextFFI;
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0), this->packed_selectors,
|
|
num_inputs, params.big_lwe_dimension, size_tracker,
|
|
allocate_gpu_memory);
|
|
|
|
this->unpacked_selectors = new CudaRadixCiphertextFFI[num_inputs];
|
|
for (uint32_t i = 0; i < num_inputs; i++) {
|
|
as_radix_ciphertext_slice<Torus>(&this->unpacked_selectors[i],
|
|
this->packed_selectors, i, i + 1);
|
|
}
|
|
|
|
this->possible_results_ct_list = new CudaRadixCiphertextFFI[num_inputs];
|
|
for (uint32_t i = 0; i < num_inputs; i++) {
|
|
create_zero_radix_ciphertext_async<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
&this->possible_results_ct_list[i], packed_len,
|
|
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
|
|
}
|
|
|
|
h_indices = nullptr;
|
|
if (allocate_gpu_memory) {
|
|
uint32_t num_bits_in_message = log2_int(params.message_modulus);
|
|
uint32_t bits_per_packed_block = 2 * num_bits_in_message;
|
|
|
|
h_indices = new uint64_t[num_inputs * packed_len];
|
|
for (uint32_t i = 0; i < num_inputs; i++) {
|
|
uint64_t val = i;
|
|
for (uint32_t b = 0; b < packed_len; b++) {
|
|
uint64_t mask = (1ULL << bits_per_packed_block) - 1;
|
|
uint64_t block_val = (val >> (b * bits_per_packed_block)) & mask;
|
|
h_indices[i * packed_len + b] = block_val;
|
|
}
|
|
}
|
|
}
|
|
|
|
const Torus ALREADY_SEEN = 2;
|
|
auto prefix_sum_fn = [ALREADY_SEEN](Torus current,
|
|
Torus previous) -> Torus {
|
|
if (previous == 1 || previous == ALREADY_SEEN) {
|
|
return ALREADY_SEEN;
|
|
}
|
|
return current;
|
|
};
|
|
this->prefix_sum_lut = new int_radix_lut<Torus>(
|
|
streams, params, 2, num_inputs, allocate_gpu_memory, size_tracker);
|
|
|
|
generate_device_accumulator_bivariate<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
this->prefix_sum_lut->get_lut(0, 0),
|
|
this->prefix_sum_lut->get_degree(0),
|
|
this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
|
|
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
|
prefix_sum_fn, allocate_gpu_memory);
|
|
this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
|
|
|
|
auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
|
|
Torus val = x % params.message_modulus;
|
|
if (val == ALREADY_SEEN)
|
|
return 0;
|
|
return val;
|
|
};
|
|
this->cleanup_lut = new int_radix_lut<Torus>(
|
|
streams, params, 1, num_inputs, allocate_gpu_memory, size_tracker);
|
|
generate_device_accumulator<Torus>(
|
|
streams.stream(0), streams.gpu_index(0),
|
|
this->cleanup_lut->get_lut(0, 0), this->cleanup_lut->get_degree(0),
|
|
this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
|
|
params.polynomial_size, params.message_modulus, params.carry_modulus,
|
|
cleanup_fn, allocate_gpu_memory);
|
|
this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
eq_buffers[i]->release(streams);
|
|
delete eq_buffers[i];
|
|
}
|
|
delete[] eq_buffers;
|
|
|
|
this->possible_results_buf->release(streams);
|
|
delete this->possible_results_buf;
|
|
|
|
this->aggregate_buf->release(streams);
|
|
delete this->aggregate_buf;
|
|
|
|
this->reduction_buf->release(streams);
|
|
delete this->reduction_buf;
|
|
|
|
this->prefix_sum_lut->release(streams);
|
|
delete this->prefix_sum_lut;
|
|
|
|
this->cleanup_lut->release(streams);
|
|
delete this->cleanup_lut;
|
|
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
this->packed_selectors,
|
|
this->allocate_gpu_memory);
|
|
delete this->packed_selectors;
|
|
|
|
delete[] this->unpacked_selectors;
|
|
|
|
for (uint32_t i = 0; i < num_inputs; i++) {
|
|
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
|
|
&this->possible_results_ct_list[i],
|
|
this->allocate_gpu_memory);
|
|
}
|
|
delete[] this->possible_results_ct_list;
|
|
|
|
cuda_event_destroy(incoming_event, streams.gpu_index(0));
|
|
|
|
uint32_t num_gpus = active_streams.count();
|
|
for (uint j = 0; j < num_streams; j++) {
|
|
for (uint k = 0; k < num_gpus; k++) {
|
|
cuda_event_destroy(outgoing_events[j * num_gpus + k],
|
|
active_streams.gpu_index(k));
|
|
}
|
|
}
|
|
delete[] outgoing_events;
|
|
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
sub_streams[i].release();
|
|
}
|
|
delete[] sub_streams;
|
|
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
|
|
delete[] h_indices;
|
|
}
|
|
};
|
|
|
|
template <typename Torus> struct int_unchecked_index_of_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
uint32_t num_inputs;
|
|
|
|
int_comparison_buffer<Torus> **eq_buffers;
|
|
int_final_index_from_selectors_buffer<Torus> *final_index_buf;
|
|
|
|
CudaStreams active_streams;
|
|
CudaStreams *sub_streams;
|
|
cudaEvent_t incoming_event;
|
|
cudaEvent_t *outgoing_events;
|
|
uint32_t num_streams;
|
|
|
|
int_unchecked_index_of_buffer(CudaStreams streams, int_radix_params params,
|
|
uint32_t num_inputs, uint32_t num_blocks,
|
|
uint32_t num_blocks_index,
|
|
bool allocate_gpu_memory,
|
|
uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->num_inputs = num_inputs;
|
|
|
|
uint32_t num_streams_to_use =
|
|
std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs);
|
|
if (num_streams_to_use == 0)
|
|
num_streams_to_use = 1;
|
|
|
|
this->num_streams = num_streams_to_use;
|
|
this->active_streams = streams.active_gpu_subset(num_blocks);
|
|
|
|
incoming_event = cuda_create_event(streams.gpu_index(0));
|
|
sub_streams = new CudaStreams[num_streams_to_use];
|
|
outgoing_events =
|
|
new cudaEvent_t[num_streams_to_use * active_streams.count()];
|
|
|
|
for (uint32_t i = 0; i < num_streams_to_use; i++) {
|
|
sub_streams[i].create_on_same_gpus(active_streams);
|
|
for (uint32_t j = 0; j < active_streams.count(); j++) {
|
|
outgoing_events[i * active_streams.count() + j] =
|
|
cuda_create_event(active_streams.gpu_index(j));
|
|
}
|
|
}
|
|
|
|
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
this->eq_buffers[i] = new int_comparison_buffer<Torus>(
|
|
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
|
|
size_tracker);
|
|
}
|
|
|
|
this->final_index_buf = new int_final_index_from_selectors_buffer<Torus>(
|
|
streams, params, num_inputs, num_blocks_index, allocate_gpu_memory,
|
|
size_tracker);
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
eq_buffers[i]->release(streams);
|
|
delete eq_buffers[i];
|
|
}
|
|
delete[] eq_buffers;
|
|
|
|
this->final_index_buf->release(streams);
|
|
delete this->final_index_buf;
|
|
|
|
cuda_event_destroy(incoming_event, streams.gpu_index(0));
|
|
|
|
uint32_t num_gpus = active_streams.count();
|
|
for (uint j = 0; j < num_streams; j++) {
|
|
for (uint k = 0; k < num_gpus; k++) {
|
|
cuda_event_destroy(outgoing_events[j * num_gpus + k],
|
|
active_streams.gpu_index(k));
|
|
}
|
|
}
|
|
delete[] outgoing_events;
|
|
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
sub_streams[i].release();
|
|
}
|
|
delete[] sub_streams;
|
|
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
}
|
|
};
|
|
|
|
template <typename Torus> struct int_unchecked_index_of_clear_buffer {
|
|
int_radix_params params;
|
|
bool allocate_gpu_memory;
|
|
uint32_t num_inputs;
|
|
|
|
int_comparison_buffer<Torus> **eq_buffers;
|
|
int_final_index_from_selectors_buffer<Torus> *final_index_buf;
|
|
|
|
CudaStreams active_streams;
|
|
CudaStreams *sub_streams;
|
|
cudaEvent_t incoming_event;
|
|
cudaEvent_t *outgoing_events;
|
|
uint32_t num_streams;
|
|
|
|
int_unchecked_index_of_clear_buffer(CudaStreams streams,
|
|
int_radix_params params,
|
|
uint32_t num_inputs, uint32_t num_blocks,
|
|
uint32_t num_blocks_index,
|
|
bool allocate_gpu_memory,
|
|
uint64_t &size_tracker) {
|
|
this->params = params;
|
|
this->allocate_gpu_memory = allocate_gpu_memory;
|
|
this->num_inputs = num_inputs;
|
|
|
|
uint32_t num_streams_to_use =
|
|
std::min((uint32_t)MAX_STREAMS_FOR_VECTOR_FIND, num_inputs);
|
|
if (num_streams_to_use == 0)
|
|
num_streams_to_use = 1;
|
|
|
|
this->num_streams = num_streams_to_use;
|
|
this->active_streams = streams.active_gpu_subset(num_blocks);
|
|
uint32_t num_gpus = active_streams.count();
|
|
|
|
incoming_event = cuda_create_event(streams.gpu_index(0));
|
|
sub_streams = new CudaStreams[num_streams_to_use];
|
|
outgoing_events = new cudaEvent_t[num_streams_to_use * num_gpus];
|
|
|
|
for (uint32_t i = 0; i < num_streams_to_use; i++) {
|
|
sub_streams[i].create_on_same_gpus(active_streams);
|
|
for (uint32_t j = 0; j < num_gpus; j++) {
|
|
outgoing_events[i * num_gpus + j] =
|
|
cuda_create_event(active_streams.gpu_index(j));
|
|
}
|
|
}
|
|
|
|
this->eq_buffers = new int_comparison_buffer<Torus> *[num_streams];
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
this->eq_buffers[i] = new int_comparison_buffer<Torus>(
|
|
streams, EQ, params, num_blocks, false, allocate_gpu_memory,
|
|
size_tracker);
|
|
}
|
|
|
|
this->final_index_buf = new int_final_index_from_selectors_buffer<Torus>(
|
|
streams, params, num_inputs, num_blocks_index, allocate_gpu_memory,
|
|
size_tracker);
|
|
}
|
|
|
|
void release(CudaStreams streams) {
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
eq_buffers[i]->release(streams);
|
|
delete eq_buffers[i];
|
|
}
|
|
delete[] eq_buffers;
|
|
|
|
this->final_index_buf->release(streams);
|
|
delete this->final_index_buf;
|
|
|
|
cuda_event_destroy(incoming_event, streams.gpu_index(0));
|
|
|
|
uint32_t num_gpus = active_streams.count();
|
|
for (uint j = 0; j < num_streams; j++) {
|
|
for (uint k = 0; k < num_gpus; k++) {
|
|
cuda_event_destroy(outgoing_events[j * num_gpus + k],
|
|
active_streams.gpu_index(k));
|
|
}
|
|
}
|
|
delete[] outgoing_events;
|
|
|
|
for (uint32_t i = 0; i < num_streams; i++) {
|
|
sub_streams[i].release();
|
|
}
|
|
delete[] sub_streams;
|
|
|
|
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
|
|
}
|
|
};
|