// Mirror of https://github.com/zama-ai/tfhe-rs.git (synced 2026-01-09 14:47:56 -05:00)
#ifndef CUDA_INTEGER_UTILITIES_H
#define CUDA_INTEGER_UTILITIES_H

#include "integer.h"
#include "integer/radix_ciphertext.cuh"
#include "integer/radix_ciphertext.h"
#include "keyswitch/keyswitch.h"
#include "pbs/programmable_bootstrap.cuh"
#include "utils/helper_multi_gpu.cuh"
#include <cmath>
#include <functional>
#include <queue>

#include <stdio.h>

#include "crypto/keyswitch.cuh"

class NoiseLevel {
public:
  // Constants equivalent to the Rust code
  static const uint64_t NOMINAL = 1;
  static const uint64_t ZERO = 0;
  static const uint64_t UNKNOWN = std::numeric_limits<uint64_t>::max();
};

#ifdef DEBUG
#define CHECK_NOISE_LEVEL(noise_level_expr, msg_mod, carry_mod)                \
  do {                                                                         \
    if ((msg_mod) == 2 && (carry_mod) == 2) {                                  \
      constexpr int max_noise_level = 3;                                       \
      if ((noise_level_expr) > max_noise_level)                                \
        PANIC("Cuda error: noise exceeds maximum authorized value for 1_1 "    \
              "parameters");                                                   \
    } else if ((msg_mod) == 4 && (carry_mod) == 4) {                           \
      constexpr int max_noise_level = 5;                                       \
      if ((noise_level_expr) > max_noise_level)                                \
        PANIC("Cuda error: noise exceeds maximum authorized value for 2_2 "    \
              "parameters");                                                   \
    } else if ((msg_mod) == 8 && (carry_mod) == 8) {                           \
      constexpr int max_noise_level = 9;                                       \
      if ((noise_level_expr) > max_noise_level)                                \
        PANIC("Cuda error: noise exceeds maximum authorized value for 3_3 "    \
              "parameters");                                                   \
    } else if ((msg_mod) == 0 && (carry_mod) == 0) {                           \
      break;                                                                   \
    } else if ((msg_mod) == 4 && (carry_mod) == 32) {                          \
      break;                                                                   \
    } else {                                                                   \
      PANIC("Invalid message modulus or carry modulus")                        \
    }                                                                          \
  } while (0)
#else
#define CHECK_NOISE_LEVEL(noise_level_expr, message_modulus, carry_modulus)    \
  do {                                                                         \
  } while (0)
#endif
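
// Illustrative usage sketch (not part of the library API; the values below
// are assumptions): in a DEBUG build, code that tracks a block's noise level
// would assert it against the parameter set, e.g. for 2_2 parameters
// (message_modulus = 4, carry_modulus = 4):
//
//   uint64_t noise_level = lhs_noise + rhs_noise; // hypothetical tracking
//   CHECK_NOISE_LEVEL(noise_level, 4, 4);         // panics above 5
//
// In non-DEBUG builds the macro expands to an empty do/while statement.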

template <typename Torus>
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
                                          uint32_t value, uint32_t blocks_count,
                                          uint32_t lwe_size);
void generate_ids_update_degrees(uint64_t *terms_degree, size_t *h_lwe_idx_in,
                                 size_t *h_lwe_idx_out,
                                 int32_t *h_smart_copy_in,
                                 int32_t *h_smart_copy_out, size_t ch_amount,
                                 uint32_t num_radix, uint32_t num_blocks,
                                 size_t chunk_size, size_t message_max,
                                 size_t &total_count, size_t &message_count,
                                 size_t &carry_count, size_t &sm_copy_count);
/*
 * generate bivariate accumulator (lut) for device pointer
 * stream - cuda stream
 * acc_bivariate - device pointer for bivariate accumulator
 * ...
 * f - wrapping function with two Torus inputs
 */
template <typename Torus>
void generate_device_accumulator_bivariate(
    cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
    uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
    std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated);

template <typename Torus>
void generate_device_accumulator_bivariate_with_factor(
    cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
    uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
    std::function<Torus(Torus, Torus)> f, int factor,
    bool gpu_memory_allocated);

template <typename Torus>
void generate_device_accumulator_with_encoding(
    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
    uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_message_modulus, uint32_t input_carry_modulus,
    uint32_t output_message_modulus, uint32_t output_carry_modulus,
    std::function<Torus(Torus)> f, bool gpu_memory_allocated);

template <typename Torus>
void generate_device_accumulator_no_encoding(
    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
    uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
    uint32_t polynomial_size, std::function<Torus(uint32_t)> f,
    bool gpu_memory_allocated);

/*
 * generate univariate accumulator (lut) for device pointer
 * stream - cuda stream
 * acc - device pointer for univariate accumulator
 * ...
 * f - evaluating function with one Torus input
 */
template <typename Torus>
void generate_device_accumulator(
    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
    uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t message_modulus, uint32_t carry_modulus,
    std::function<Torus(Torus)> f, bool gpu_memory_allocated);
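
// Call sketch (illustrative only; the numeric parameters below are
// placeholders, not a validated parameter set). A LUT that extracts the
// message part of a block would typically be generated as:
//
//   auto f_message = [](uint64_t x) -> uint64_t { return x % 4; };
//   generate_device_accumulator<uint64_t>(
//       stream, gpu_index, d_acc, degree, max_degree,
//       /*glwe_dimension=*/1, /*polynomial_size=*/2048,
//       /*message_modulus=*/4, /*carry_modulus=*/4, f_message,
//       /*gpu_memory_allocated=*/true);
//
// where d_acc, degree and max_degree are the buffers owned by an
// int_radix_lut (see get_lut / get_degree / get_max_degree below).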

template <typename Torus>
void generate_many_lut_device_accumulator(
    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degrees,
    uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t message_modulus, uint32_t carry_modulus,
    std::vector<std::function<Torus(Torus)>> &f, bool gpu_memory_allocated);

struct radix_columns {
  std::vector<uint32_t> columns_counter;
  uint32_t num_blocks;
  uint32_t num_radix_in_vec;
  uint32_t chunk_size;
  radix_columns(const uint64_t *const input_degrees, uint32_t num_blocks,
                uint32_t num_radix_in_vec, uint32_t chunk_size,
                bool &needs_processing)
      : num_blocks(num_blocks), num_radix_in_vec(num_radix_in_vec),
        chunk_size(chunk_size) {
    needs_processing = false;
    columns_counter.resize(num_blocks, 0);
    for (uint32_t i = 0; i < num_radix_in_vec; ++i) {
      for (uint32_t j = 0; j < num_blocks; ++j) {
        if (input_degrees[i * num_blocks + j])
          columns_counter[j] += 1;
      }
    }

    for (uint32_t i = 0; i < num_blocks; ++i) {
      if (columns_counter[i] > chunk_size) {
        needs_processing = true;
        break;
      }
    }
  }

  void next_accumulation(uint32_t &total_ciphertexts,
                         uint32_t &message_ciphertexts,
                         bool &needs_processing) {
    message_ciphertexts = 0;
    total_ciphertexts = 0;
    needs_processing = false;
    for (int i = num_blocks - 1; i > 0; --i) {
      uint32_t cur_count = columns_counter[i];
      uint32_t prev_count = columns_counter[i - 1];
      uint32_t new_count = 0;

      // accumulated blocks from the current column
      new_count += cur_count / chunk_size;
      // all accumulated message blocks need a PBS
      message_ciphertexts += new_count;
      // carry blocks from the previous column
      new_count += prev_count / chunk_size;
      // both carry and message blocks need a PBS
      total_ciphertexts += new_count;
      // now add the remaining non-accumulated blocks that do not require a PBS
      new_count += cur_count % chunk_size;

      columns_counter[i] = new_count;

      if (new_count > chunk_size)
        needs_processing = true;
    }

    // now do the same for the 0th block
    uint32_t new_count = columns_counter[0] / chunk_size;
    message_ciphertexts += new_count;
    total_ciphertexts += new_count;
    new_count += columns_counter[0] % chunk_size;
    columns_counter[0] = new_count;

    if (new_count > chunk_size) {
      needs_processing = true;
    }
  }
};
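
// Worked sketch of next_accumulation (values assumed for illustration): with
// 2_2 parameters, chunk_size = (4 * 4 - 1) / (4 - 1) = 5. Summing 7 radix
// integers of 2 blocks each gives columns_counter = {7, 7}; one call to
// next_accumulation() turns column 1 into 7 / 5 (accumulated message block)
// + 7 / 5 (carry coming from column 0) + 7 % 5 (untouched blocks) = 4 blocks,
// and column 0 into 1 + 2 = 3 blocks, for 3 PBS in total; no further
// processing round is needed since both columns now fit in a single chunk.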

inline void calculate_final_degrees(uint64_t *const out_degrees,
                                    const uint64_t *const input_degrees,
                                    uint32_t num_blocks,
                                    uint32_t num_radix_in_vec,
                                    uint32_t chunk_size,
                                    uint64_t message_modulus) {

  auto get_degree = [message_modulus](uint64_t degree) -> uint64_t {
    return std::min(message_modulus - 1, degree);
  };
  std::vector<std::queue<uint64_t>> columns(num_blocks);
  for (uint32_t i = 0; i < num_radix_in_vec; ++i) {
    for (uint32_t j = 0; j < num_blocks; ++j) {
      if (input_degrees[i * num_blocks + j])
        columns[j].push(input_degrees[i * num_blocks + j]);
    }
  }

  for (uint32_t i = 0; i < num_blocks; ++i) {
    auto &col = columns[i];
    while (col.size() > 1) {
      uint32_t cur_degree = 0;
      uint32_t mn = std::min(chunk_size, (uint32_t)col.size());
      for (int j = 0; j < mn; ++j) {
        cur_degree += col.front();
        col.pop();
      }
      const uint64_t new_degree = get_degree(cur_degree);
      col.push(new_degree);
      if ((i + 1) < num_blocks) {
        columns[i + 1].push(new_degree);
      }
    }
  }

  for (int i = 0; i < num_blocks; i++) {
    out_degrees[i] = (columns[i].empty()) ? 0 : columns[i].front();
  }
}
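
// Minimal host-side sketch (illustrative only, not used by the library): the
// final degree of each output block is the clamped sum of the column it
// accumulates, so with message_modulus = 4 every populated output block ends
// up with a degree of at most 3. All values below are assumptions chosen for
// the example.
inline void example_calculate_final_degrees_usage() {
  const uint32_t num_blocks = 2, num_radix_in_vec = 3, chunk_size = 5;
  // Three 2-block radix integers, every block already at degree 3.
  const uint64_t input_degrees[6] = {3, 3, 3, 3, 3, 3};
  uint64_t out_degrees[2] = {0, 0};
  calculate_final_degrees(out_degrees, input_degrees, num_blocks,
                          num_radix_in_vec, chunk_size,
                          /*message_modulus=*/4);
  // out_degrees[0] == 3 and out_degrees[1] == 3 after clamping.
}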

struct int_radix_params {
  PBS_TYPE pbs_type;
  uint32_t glwe_dimension;
  uint32_t polynomial_size;
  uint32_t big_lwe_dimension;
  uint32_t small_lwe_dimension;
  uint32_t ks_level;
  uint32_t ks_base_log;
  uint32_t pbs_level;
  uint32_t pbs_base_log;
  uint32_t grouping_factor;
  uint32_t message_modulus;
  uint32_t carry_modulus;
  PBS_MS_REDUCTION_T noise_reduction_type;

  int_radix_params(PBS_TYPE pbs_type, uint32_t glwe_dimension,
                   uint32_t polynomial_size, uint32_t big_lwe_dimension,
                   uint32_t small_lwe_dimension, uint32_t ks_level,
                   uint32_t ks_base_log, uint32_t pbs_level,
                   uint32_t pbs_base_log, uint32_t grouping_factor,
                   uint32_t message_modulus, uint32_t carry_modulus,
                   PBS_MS_REDUCTION_T noise_reduction_type)

      : pbs_type(pbs_type), glwe_dimension(glwe_dimension),
        polynomial_size(polynomial_size), big_lwe_dimension(big_lwe_dimension),
        small_lwe_dimension(small_lwe_dimension), ks_level(ks_level),
        ks_base_log(ks_base_log), pbs_level(pbs_level),
        pbs_base_log(pbs_base_log), grouping_factor(grouping_factor),
        message_modulus(message_modulus), carry_modulus(carry_modulus),
        noise_reduction_type(noise_reduction_type){};

  int_radix_params() = default;

  void print() {
    printf("pbs_type: %u, glwe_dimension: %u, "
           "polynomial_size: %u, "
           "big_lwe_dimension: %u, "
           "small_lwe_dimension: %u, ks_level: %u, ks_base_log: %u, pbs_level: "
           "%u, pbs_base_log: "
           "%u, grouping_factor: %u, message_modulus: %u, carry_modulus: %u\n",
           pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
           small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
           grouping_factor, message_modulus, carry_modulus);
  };
};
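
// Construction sketch (every numeric value below is a placeholder chosen for
// illustration, not a validated cryptographic parameter set):
//
//   int_radix_params params(
//       PBS_TYPE::CLASSICAL, /*glwe_dimension=*/1, /*polynomial_size=*/2048,
//       /*big_lwe_dimension=*/2048, /*small_lwe_dimension=*/742,
//       /*ks_level=*/5, /*ks_base_log=*/3, /*pbs_level=*/1,
//       /*pbs_base_log=*/23, /*grouping_factor=*/0, /*message_modulus=*/4,
//       /*carry_modulus=*/4, noise_reduction_type);
//   params.print();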

// Store things needed to apply LUTs
template <typename InputTorus, typename OutputTorus>
struct int_radix_lut_custom_input_output {
  int_radix_params params;
  // The number of blocks to be processed by the LUT. Can be
  // smaller than the actual num_input_blocks because some LUT types
  // (like noise squashing) perform packing.
  uint32_t num_blocks = 0;
  // The number of blocks of the input ciphertext. For noise
  // squashing these blocks are packed into num_blocks
  uint32_t num_input_blocks = 0;
  // Number of LUTs to store in this structure
  uint32_t num_luts = 0;
  // ManyLUT is the mechanism to apply several LUTs in a single PBS
  uint32_t num_many_lut = 1;
  // The LWE dimension of the KS output / PBS input. Initialized
  // to the max value so that we crash if this value is set incorrectly
  // by the caller
  uint32_t input_big_lwe_dimension = (uint32_t)-1;

  // Tracks the degree of each LUT and the max degree on CPU
  // The max degree is (message_modulus * carry_modulus - 1) except for many
  // lut for which it's different
  uint64_t *degrees = nullptr;
  uint64_t *max_degrees = nullptr;

  CudaStreams active_streams;
  bool mem_reuse = false;

  // There will be one buffer on each GPU in multi-GPU computations
  // (same for tmp lwe arrays)
  std::vector<int8_t *> buffer;

  // These arrays will reside on all GPUs
  // lut could actually be allocated & initialized GPU per GPU but this is not
  // done at the moment
  std::vector<OutputTorus *> lut_vec;
  std::vector<InputTorus *> lut_indexes_vec;
  InputTorus *h_lut_indexes = nullptr;
  // All tmp lwe arrays and index arrays for lwe contain the total
  // amount of blocks to be computed on, there is no split between GPUs
  // for the moment
  InputTorus *lwe_indexes_in = nullptr;
  InputTorus *lwe_indexes_out = nullptr;
  InputTorus *h_lwe_indexes_in = nullptr;
  InputTorus *h_lwe_indexes_out = nullptr;
  // Enable optimizations if lwe_indexes_(in/out) are trivial
  bool using_trivial_lwe_indexes = true;
  // lwe_trivial_indexes is the intermediary index we need in case
  // lwe_indexes_in != lwe_indexes_out
  InputTorus *lwe_trivial_indexes = nullptr;

  // buffer to store packed message bits of a radix ciphertext
  CudaRadixCiphertextFFI *tmp_lwe_before_ks = nullptr;

  /// For multi GPU execution we create vectors of pointers for inputs and
  /// outputs
  std::vector<InputTorus *> lwe_array_in_vec;
  std::vector<InputTorus *> lwe_after_ks_vec;
  std::vector<OutputTorus *> lwe_after_pbs_vec;
  std::vector<InputTorus *> lwe_trivial_indexes_vec;
  std::vector<ks_mem<InputTorus> *>
      ks_tmp_buf_vec; // buffers on each GPU to store keyswitch temporary data

  std::vector<InputTorus *> lwe_aligned_vec;

  bool gpu_memory_allocated;

  CudaStreamsBarrier multi_gpu_scatter_barrier, multi_gpu_broadcast_barrier;
  CudaStreamsBarrier multi_gpu_gather_barrier;

  // Setup the LUT configuration:
  // input_big_lwe_dimension: BIG LWE dimension of the KS output / PBS input
  // params: cryptographic parameters of the PBS output
  // num_luts: number of LUTs (or many-LUT sets) in this structure
  // num_many_lut: number of LUTs to apply in a single PBS pass
  // num_radix_blocks: number of blocks in the radix integer
  void setup_config_and_degrees(CudaStreams streams,
                                uint32_t input_big_lwe_dimension,
                                int_radix_params params, uint32_t num_luts,
                                uint32_t num_many_lut,
                                uint32_t num_radix_blocks,
                                uint32_t num_input_blocks,
                                bool allocate_gpu_memory) {
    this->params = params;
    this->num_blocks = num_radix_blocks;
    this->num_luts = num_luts;
    this->num_many_lut = num_many_lut;
    this->input_big_lwe_dimension = input_big_lwe_dimension;
    this->num_input_blocks = num_input_blocks;
    this->gpu_memory_allocated = allocate_gpu_memory;

    this->active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
  }

  void setup_degrees() {
    this->degrees =
        (uint64_t *)malloc(num_many_lut * num_luts * sizeof(uint64_t));
    this->max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
  }

  void allocate_pbs_buffers(int_radix_params params, uint32_t num_radix_blocks,
                            bool allocate_gpu_memory, uint64_t &size_tracker) {

    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;

    for (uint i = 0; i < active_streams.count(); i++) {
      cuda_set_device(active_streams.gpu_index(i));
      int8_t *gpu_pbs_buffer;
      auto num_blocks_on_gpu = std::min(
          (int)num_radix_blocks,
          std::max(threshold, get_num_inputs_on_gpu(num_radix_blocks, i,
                                                    active_streams.count())));

      uint64_t size = 0;
      execute_scratch_pbs<OutputTorus>(
          active_streams.stream(i), active_streams.gpu_index(i),
          &gpu_pbs_buffer, params.glwe_dimension, params.small_lwe_dimension,
          params.polynomial_size, params.pbs_level, params.grouping_factor,
          num_blocks_on_gpu, params.pbs_type, allocate_gpu_memory,
          params.noise_reduction_type, size);
      if (i == 0) {
        size_tracker += size;
      }
      buffer.push_back(gpu_pbs_buffer);
    }

    // This buffer is created with num_input_blocks since it
    // stores the ciphertext before KS or packing.
    tmp_lwe_before_ks = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<InputTorus>(
        active_streams.stream(0), active_streams.gpu_index(0),
        tmp_lwe_before_ks, num_input_blocks, input_big_lwe_dimension,
        size_tracker, allocate_gpu_memory);
  }

  void alloc_and_init_multi_gpu_buffers(int_radix_params params,
                                        uint32_t num_radix_blocks,
                                        bool allocate_gpu_memory,
                                        uint64_t &size_tracker) {
    GPU_ASSERT(lwe_array_in_vec.empty(), "Multi GPU buffers already allocated");

    /// With multiple GPUs we allocate arrays to be pushed to the vectors and
    /// copy data on each GPU then when we gather data to GPU 0 we can copy
    /// back to the original indexing
    multi_gpu_alloc_lwe_async(active_streams, lwe_array_in_vec,
                              num_radix_blocks, params.big_lwe_dimension + 1,
                              size_tracker, params.pbs_type,
                              allocate_gpu_memory);
    multi_gpu_alloc_lwe_async(active_streams, lwe_after_ks_vec,
                              num_radix_blocks, params.small_lwe_dimension + 1,
                              size_tracker, params.pbs_type,
                              allocate_gpu_memory);
    if (num_many_lut > 1) {
      multi_gpu_alloc_lwe_many_lut_output_async(
          active_streams, lwe_after_pbs_vec, num_radix_blocks, num_many_lut,
          params.big_lwe_dimension + 1, size_tracker, params.pbs_type,
          allocate_gpu_memory);
    } else {
      multi_gpu_alloc_lwe_async(active_streams, lwe_after_pbs_vec,
                                num_radix_blocks, params.big_lwe_dimension + 1,
                                size_tracker, params.pbs_type,
                                allocate_gpu_memory);
    }
    multi_gpu_alloc_array_async(active_streams, lwe_trivial_indexes_vec,
                                num_radix_blocks, size_tracker,
                                allocate_gpu_memory);
    cuda_synchronize_stream(active_streams.stream(0),
                            active_streams.gpu_index(0));

    // This call will not copy if allocate_gpu_memory is false
    // thus it's safe to call it on a null source pointer
    multi_gpu_copy_array_async(active_streams, lwe_trivial_indexes_vec,
                               lwe_trivial_indexes, num_radix_blocks,
                               allocate_gpu_memory);
  }

  void setup_gemm_batch_ks_temp_buffers(uint64_t &size_tracker) {
    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;

    auto inputs_on_gpu = std::min(
        (int)num_input_blocks,
        std::max(threshold, get_num_inputs_on_gpu(num_input_blocks, 0,
                                                  active_streams.count())));

    if (inputs_on_gpu >= get_threshold_ks_gemm()) {
      for (auto i = 0; i < active_streams.count(); ++i) {
        ks_mem<InputTorus> *ks_buffer;
        uint64_t sub_size_tracker = scratch_cuda_keyswitch<InputTorus>(
            active_streams.stream(i), active_streams.gpu_index(i), &ks_buffer,
            input_big_lwe_dimension, params.small_lwe_dimension, num_blocks,
            gpu_memory_allocated);

        if (i == 0) {
          size_tracker += sub_size_tracker;
        }
        ks_tmp_buf_vec.push_back(ks_buffer);
      }
    }
  }

  void setup_mem_reuse(uint32_t num_radix_blocks,
                       int_radix_lut_custom_input_output *base_lut_object) {
    // the base lut object must have at least as much memory as the current one
    if (num_radix_blocks > base_lut_object->num_blocks)
      PANIC("Cuda error: lut does not have enough blocks")
    // pbs
    buffer = base_lut_object->buffer;
    // Keyswitch
    tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks;

    /// With multiple GPUs we allocate arrays to be pushed to the vectors and
    /// copy data on each GPU then when we gather data to GPU 0 we can copy back
    /// to the original indexing
    lwe_array_in_vec = base_lut_object->lwe_array_in_vec;
    lwe_after_ks_vec = base_lut_object->lwe_after_ks_vec;
    lwe_after_pbs_vec = base_lut_object->lwe_after_pbs_vec;
    lwe_trivial_indexes_vec = base_lut_object->lwe_trivial_indexes_vec;

    ks_tmp_buf_vec = base_lut_object->ks_tmp_buf_vec;

    mem_reuse = true;
  }

  void setup_lwe_trivial_indices(uint32_t num_radix_blocks,
                                 bool allocate_gpu_memory,
                                 uint64_t &size_tracker) {
    // lwe_(input/output)_indexes are initialized to range(num_radix_blocks)
    // by default
    lwe_indexes_in = (InputTorus *)cuda_malloc_with_size_tracking_async(
        num_radix_blocks * sizeof(InputTorus), active_streams.stream(0),
        active_streams.gpu_index(0), size_tracker, allocate_gpu_memory);
    lwe_indexes_out = (InputTorus *)cuda_malloc_with_size_tracking_async(
        num_radix_blocks * sizeof(InputTorus), active_streams.stream(0),
        active_streams.gpu_index(0), size_tracker, allocate_gpu_memory);
    lwe_trivial_indexes = (InputTorus *)cuda_malloc_with_size_tracking_async(
        num_radix_blocks * sizeof(InputTorus), active_streams.stream(0),
        active_streams.gpu_index(0), size_tracker, allocate_gpu_memory);

    h_lwe_indexes_in =
        (InputTorus *)malloc(num_radix_blocks * sizeof(InputTorus));
    h_lwe_indexes_out =
        (InputTorus *)malloc(num_radix_blocks * sizeof(InputTorus));

    for (int i = 0; i < num_radix_blocks; i++)
      h_lwe_indexes_in[i] = i;

    cuda_memcpy_with_size_tracking_async_to_gpu(
        lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(InputTorus),
        active_streams.stream(0), active_streams.gpu_index(0),
        allocate_gpu_memory);
    cuda_memcpy_with_size_tracking_async_to_gpu(
        lwe_indexes_out, h_lwe_indexes_in,
        num_radix_blocks * sizeof(InputTorus), active_streams.stream(0),
        active_streams.gpu_index(0), allocate_gpu_memory);
    cuda_memcpy_with_size_tracking_async_to_gpu(
        lwe_trivial_indexes, h_lwe_indexes_in,
        num_radix_blocks * sizeof(InputTorus), active_streams.stream(0),
        active_streams.gpu_index(0), allocate_gpu_memory);
    memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
           num_radix_blocks * sizeof(InputTorus));

    h_lut_indexes =
        (InputTorus *)(calloc(num_radix_blocks, sizeof(InputTorus)));
  }

  void setup_multi_gpu(int_radix_params params, uint32_t num_radix_blocks,
                       bool allocate_gpu_memory, uint64_t &size_tracker) {

    if (!mem_reuse)
      alloc_and_init_multi_gpu_buffers(params, num_radix_blocks,
                                       allocate_gpu_memory, size_tracker);

    if (active_streams.count() > 1) {
      multi_gpu_gather_barrier.create_on(active_streams);
      multi_gpu_broadcast_barrier.create_on(active_streams);
      multi_gpu_scatter_barrier.create_on(active_streams);
    }
  }

  int_radix_lut_custom_input_output(CudaStreams streams,
                                    int_radix_params params, uint32_t num_luts,
                                    uint32_t num_radix_blocks,
                                    bool allocate_gpu_memory,
                                    uint64_t &size_tracker) {

    setup_config_and_degrees(streams, params.big_lwe_dimension, params,
                             num_luts, 1, num_radix_blocks, num_radix_blocks,
                             allocate_gpu_memory);

    setup_degrees();

    allocate_pbs_buffers(params, num_radix_blocks, allocate_gpu_memory,
                         size_tracker);

    allocate_luts_and_indexes(num_radix_blocks, size_tracker);

    setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory,
                              size_tracker);

    setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory,
                    size_tracker);
  }

  // Constructor for noise squashing LUT which packs the input
  // ciphertext of num_input_blocks blocks into a new one with fewer blocks,
  // num_radix_blocks
  int_radix_lut_custom_input_output(CudaStreams streams,
                                    uint32_t input_big_lwe_dimension,
                                    int_radix_params params, uint32_t num_luts,
                                    uint32_t num_radix_blocks,
                                    uint32_t num_input_blocks,
                                    bool allocate_gpu_memory,
                                    uint64_t &size_tracker) {

    setup_config_and_degrees(streams, input_big_lwe_dimension, params, num_luts,
                             1, num_radix_blocks, num_input_blocks,
                             allocate_gpu_memory);

    setup_degrees();

    allocate_pbs_buffers(params, num_radix_blocks, allocate_gpu_memory,
                         size_tracker);

    allocate_luts_and_indexes(num_radix_blocks, size_tracker);

    setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory,
                              size_tracker);

    setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory,
                    size_tracker);
  }

  // constructor to reuse memory
  int_radix_lut_custom_input_output(
      CudaStreams streams, int_radix_params params, uint32_t num_luts,
      uint32_t num_radix_blocks,
      int_radix_lut_custom_input_output *base_lut_object,
      bool allocate_gpu_memory, uint64_t &size_tracker) {
    setup_config_and_degrees(streams, params.big_lwe_dimension, params,
                             num_luts, 1, num_radix_blocks, num_radix_blocks,
                             allocate_gpu_memory);

    setup_degrees();

    setup_mem_reuse(num_radix_blocks, base_lut_object);

    allocate_luts_and_indexes(num_radix_blocks, size_tracker);

    setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory,
                              size_tracker);

    setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory,
                    size_tracker);
  }

  // Constructor for many luts
  int_radix_lut_custom_input_output(CudaStreams streams,
                                    int_radix_params params, uint32_t num_luts,
                                    uint32_t num_radix_blocks,
                                    uint32_t num_many_lut,
                                    bool allocate_gpu_memory,
                                    uint64_t &size_tracker) {

    setup_config_and_degrees(streams, params.big_lwe_dimension, params,
                             num_luts, num_many_lut, num_radix_blocks,
                             num_radix_blocks, allocate_gpu_memory);

    setup_degrees();

    allocate_pbs_buffers(params, num_radix_blocks, allocate_gpu_memory,
                         size_tracker);

    allocate_luts_and_indexes(num_radix_blocks, size_tracker);

    setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory,
                              size_tracker);

    setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory,
                    size_tracker);
  }

  // Return a pointer to the idx-th lut in gpu_index's global memory
  OutputTorus *get_lut(uint32_t gpu_index, size_t idx) {
    if (!gpu_memory_allocated)
      return nullptr;
    auto lut = lut_vec[gpu_index];
    size_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;

    if (lut == nullptr)
      PANIC("Cuda error: invalid lut pointer")
    return &lut[idx * lut_size];
  }

  // Return a pointer to the idx-th degree
  uint64_t *get_degree(size_t idx) {
    GPU_ASSERT(idx < num_luts, "Invalid degree requested");
    return &degrees[num_many_lut * idx];
  }

  // Return a pointer to the idx-th max degree
  uint64_t *get_max_degree(size_t idx) {
    GPU_ASSERT(idx < num_luts, "Invalid degree requested");
    return &max_degrees[idx];
  }

  // Return a pointer to the idx-th lut indexes in gpu_index's global memory
  InputTorus *get_lut_indexes(uint32_t gpu_index, size_t ind) {
    if (!gpu_memory_allocated)
      return nullptr;
    auto lut_indexes = lut_indexes_vec[gpu_index];
    return &lut_indexes[ind];
  }

  // Allocate LUT
  // LUT is used as a trivial encryption and must be initialized outside
  // this constructor
  void allocate_luts_and_indexes(uint32_t num_radix_blocks,
                                 uint64_t &size_tracker) {
    uint64_t lut_indexes_size = num_radix_blocks * sizeof(InputTorus);
    uint64_t lut_buffer_size = (params.glwe_dimension + 1) *
                               params.polynomial_size * sizeof(OutputTorus);

    for (uint i = 0; i < active_streams.count(); i++) {
      auto lut = (OutputTorus *)cuda_malloc_with_size_tracking_async(
          num_luts * lut_buffer_size, active_streams.stream(i),
          active_streams.gpu_index(i), size_tracker, gpu_memory_allocated);
      auto lut_indexes = (InputTorus *)cuda_malloc_with_size_tracking_async(
          lut_indexes_size, active_streams.stream(i),
          active_streams.gpu_index(i), size_tracker, gpu_memory_allocated);
      // lut_indexes is initialized to 0 by default
      // if a different behavior is wanted, it should be rewritten later
      cuda_memset_with_size_tracking_async(
          lut_indexes, 0, lut_indexes_size, active_streams.stream(i),
          active_streams.gpu_index(i), gpu_memory_allocated);
      lut_vec.push_back(lut);
      lut_indexes_vec.push_back(lut_indexes);
    }
  }
  // If this function is called we assume the lwe_indexes_(in/out) are no
  // longer trivial and thus we disable optimizations
  void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index,
                       InputTorus *h_indexes_in, InputTorus *h_indexes_out) {

    memcpy(h_lwe_indexes_in, h_indexes_in, num_blocks * sizeof(InputTorus));
    memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(InputTorus));

    cuda_memcpy_with_size_tracking_async_to_gpu(
        lwe_indexes_in, h_lwe_indexes_in, num_blocks * sizeof(InputTorus),
        stream, gpu_index, gpu_memory_allocated);
    cuda_memcpy_with_size_tracking_async_to_gpu(
        lwe_indexes_out, h_lwe_indexes_out, num_blocks * sizeof(InputTorus),
        stream, gpu_index, gpu_memory_allocated);

    using_trivial_lwe_indexes = false;
  }

  // Broadcast luts from device gpu_indexes[0] to all active gpus
  void broadcast_lut(CudaStreams new_active_streams,
                     bool broadcast_lut_values = true) {
    PANIC_IF_FALSE(new_active_streams.gpu_index(0) ==
                       active_streams.gpu_index(0),
                   "Broadcasting LUTs can only be done using the same GPUs "
                   "originally assigned to the int_radix_lut");

    // We only broadcast if there is more than one active GPU
    if (new_active_streams.count() == 1)
      return;

    GPU_ASSERT(active_streams.count() >= new_active_streams.count(),
               "To broadcast a LUT to a GPU set, it must have been initialized "
               "with a GPU set that is greater or equal in size");

    int active_device = cuda_get_device();

    uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;

    // Wait for GPU 0 to receive all data from previous computations
    // that may have occurred on different GPUs
    multi_gpu_broadcast_barrier.local_streams_wait_for_stream_0(
        new_active_streams);

    // The LUT and its indexes reside on GPU 0;
    // these were filled by calls to generate_device_accumulator.
    // Due to the previous synchronization, we're sure these buffers have
    // finished copying to GPU 0 from the CPU.
    auto src_lut = lut_vec[0];
    auto src_lut_indexes = lut_indexes_vec[0];

    for (uint i = 1; i < new_active_streams.count(); i++) {
      PANIC_IF_FALSE(
          new_active_streams.gpu_index(i) == active_streams.gpu_index(i),
          "Broadcasting LUTs can only be done to the LUT streams or to new "
          "streams that reside on the same GPUs as the source LUTs");

      // Check for redundant copies
#ifndef DEBUG_FAKE_MULTI_GPU
      PANIC_IF_FALSE(new_active_streams.gpu_index(i) !=
                         new_active_streams.gpu_index(0),
                     "Broadcast LUT does not handle duplicate GPUs in the "
                     "active streams set");
#endif

      if (broadcast_lut_values) {
        auto dst_lut = lut_vec[i];
        cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
            dst_lut, src_lut, num_luts * lut_size * sizeof(OutputTorus),
            new_active_streams.stream(i), new_active_streams.gpu_index(i),
            gpu_memory_allocated);
      }
      auto dst_lut_indexes = lut_indexes_vec[i];
      cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
          dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(InputTorus),
          new_active_streams.stream(i), new_active_streams.gpu_index(i),
          gpu_memory_allocated);
    }

    // Ensure the device set at the end of this method is the same as it was
    // at the beginning
    cuda_set_device(active_device);
  }

  void allocate_lwe_vector_for_non_trivial_indexes(
      CudaStreams streams, uint64_t max_num_radix_blocks,
      uint64_t &size_tracker, bool allocate_gpu_memory) {

    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;

    // We only need to create the auxiliary arrays in the multi-GPU case
    if (active_streams.count() > 1) {
      lwe_aligned_vec.resize(active_streams.count());
      for (uint i = 0; i < active_streams.count(); i++) {
        uint64_t size_tracker_on_array_i = 0;
        auto inputs_on_gpu = std::min(
            (int)max_num_radix_blocks,
            std::max(threshold, get_num_inputs_on_gpu(max_num_radix_blocks, i,
                                                      active_streams.count())));
        InputTorus *d_array =
            (InputTorus *)cuda_malloc_with_size_tracking_async(
                inputs_on_gpu * (params.big_lwe_dimension + 1) *
                    sizeof(InputTorus),
                streams.stream(0), streams.gpu_index(0),
                size_tracker_on_array_i, allocate_gpu_memory);
        lwe_aligned_vec[i] = d_array;
        size_tracker += size_tracker_on_array_i;
      }
    }
  }

  void release(CudaStreams streams) {
    PANIC_IF_FALSE(lut_indexes_vec.size() == lut_vec.size(),
                   "Lut vec and Lut vec indexes must have the same size");
    for (uint i = 0; i < lut_vec.size(); i++) {
      cuda_drop_with_size_tracking_async(lut_vec[i], active_streams.stream(i),
                                         active_streams.gpu_index(i),
                                         gpu_memory_allocated);
      cuda_drop_with_size_tracking_async(
          lut_indexes_vec[i], active_streams.stream(i),
          active_streams.gpu_index(i), gpu_memory_allocated);
    }
    cuda_drop_with_size_tracking_async(lwe_indexes_in, active_streams.stream(0),
                                       active_streams.gpu_index(0),
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(
        lwe_indexes_out, active_streams.stream(0), active_streams.gpu_index(0),
        gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(
        lwe_trivial_indexes, active_streams.stream(0),
        active_streams.gpu_index(0), gpu_memory_allocated);

    cuda_synchronize_stream(active_streams.stream(0),
                            active_streams.gpu_index(0));
    lut_vec.clear();
    lut_indexes_vec.clear();
    free(h_lwe_indexes_in);
    free(h_lwe_indexes_out);

    if (active_streams.count() > 1) {
      active_streams.synchronize();
      multi_gpu_gather_barrier.release();
      multi_gpu_broadcast_barrier.release();
      multi_gpu_scatter_barrier.release();
    }

    if (!mem_reuse) {
      release_radix_ciphertext_async(active_streams.stream(0),
                                     active_streams.gpu_index(0),
                                     tmp_lwe_before_ks, gpu_memory_allocated);
      for (int i = 0; i < buffer.size(); i++) {
        switch (params.pbs_type) {
        case MULTI_BIT:
          cleanup_cuda_multi_bit_programmable_bootstrap(
              active_streams.stream(i), active_streams.gpu_index(i),
              &buffer[i]);
          break;
        case CLASSICAL:
          cleanup_cuda_programmable_bootstrap(active_streams.stream(i),
                                              active_streams.gpu_index(i),
                                              &buffer[i]);
          break;
        default:
          PANIC("Cuda error (PBS): unknown PBS type. ")
        }
        cuda_synchronize_stream(active_streams.stream(i),
                                active_streams.gpu_index(i));
      }
      delete tmp_lwe_before_ks;
      buffer.clear();

      if (gpu_memory_allocated) {
        multi_gpu_release_async(active_streams, lwe_array_in_vec);
        multi_gpu_release_async(active_streams, lwe_after_ks_vec);
        multi_gpu_release_async(active_streams, lwe_after_pbs_vec);
        multi_gpu_release_async(active_streams, lwe_trivial_indexes_vec);
      }
      lwe_array_in_vec.clear();
      lwe_after_ks_vec.clear();
      lwe_after_pbs_vec.clear();
      lwe_trivial_indexes_vec.clear();
      if (lwe_aligned_vec.size() > 0) {
        for (uint i = 0; i < active_streams.count(); i++) {
          cuda_drop_with_size_tracking_async(
              lwe_aligned_vec[i], active_streams.stream(0),
              active_streams.gpu_index(0), gpu_memory_allocated);
        }
        lwe_aligned_vec.clear();
      }

      for (auto i = 0; i < ks_tmp_buf_vec.size(); i++) {
        cleanup_cuda_keyswitch(active_streams.stream(i),
                               active_streams.gpu_index(i), ks_tmp_buf_vec[i],
                               gpu_memory_allocated);
      }
      ks_tmp_buf_vec.clear();
    }
    free(h_lut_indexes);
    free(degrees);
    free(max_degrees);
  }
};

template <typename Torus, typename OutputTorus = Torus>
using int_radix_lut = int_radix_lut_custom_input_output<Torus, Torus>;
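
// Typical lifecycle sketch (mirrors the pattern used by the buffers below;
// the streams/params objects and some_function are assumed to exist):
//
//   uint64_t size_tracker = 0;
//   auto *lut = new int_radix_lut<uint64_t>(streams, params, /*num_luts=*/1,
//                                           num_radix_blocks,
//                                           /*allocate_gpu_memory=*/true,
//                                           size_tracker);
//   generate_device_accumulator<uint64_t>(
//       streams.stream(0), streams.gpu_index(0), lut->get_lut(0, 0),
//       lut->get_degree(0), lut->get_max_degree(0), params.glwe_dimension,
//       params.polynomial_size, params.message_modulus, params.carry_modulus,
//       some_function, /*gpu_memory_allocated=*/true);
//   lut->broadcast_lut(lut->active_streams);
//   ...
//   lut->release(streams);
//   delete lut;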

template <typename InputTorus>
struct int_noise_squashing_lut
    : int_radix_lut_custom_input_output<InputTorus, __uint128_t> {

  std::vector<InputTorus *> lwe_aligned_scatter_vec;
  std::vector<__uint128_t *> lwe_aligned_gather_vec;
  // noise squashing constructor
  int_noise_squashing_lut(CudaStreams streams, int_radix_params params,
                          uint32_t input_glwe_dimension,
                          uint32_t input_polynomial_size,
                          uint32_t num_radix_blocks,
                          uint32_t original_num_blocks,
                          bool allocate_gpu_memory, uint64_t &size_tracker)

      : int_radix_lut_custom_input_output<InputTorus, __uint128_t>(
            streams, input_glwe_dimension * input_polynomial_size, params, 1,
            num_radix_blocks, original_num_blocks, allocate_gpu_memory,
            size_tracker) {

    // lut for the squashing
    auto f_squash = [](__uint128_t block) -> __uint128_t { return block; };

    generate_device_accumulator<__uint128_t>(
        this->active_streams.stream(0), this->active_streams.gpu_index(0),
        this->get_lut(0, 0), this->get_degree(0), this->get_max_degree(0),
        params.glwe_dimension, params.polynomial_size, params.message_modulus,
        params.carry_modulus, f_squash, allocate_gpu_memory);

    this->broadcast_lut(this->active_streams);
  }

  using int_radix_lut_custom_input_output<InputTorus, __uint128_t>::release;
};

// Forward declarations for operation buffers
template <typename Torus> struct int_sub_and_propagate;

template <typename Torus> struct int_bit_extract_luts_buffer {
  int_radix_params params;
  int_radix_lut<Torus> *lut;
  bool gpu_memory_allocated;

  // With offset
  int_bit_extract_luts_buffer(CudaStreams streams, int_radix_params params,
                              uint32_t bits_per_block, uint32_t final_offset,
                              uint32_t num_radix_blocks,
                              bool allocate_gpu_memory,
                              uint64_t &size_tracker) {
    this->params = params;
    gpu_memory_allocated = allocate_gpu_memory;

    lut = new int_radix_lut<Torus>(streams, params, bits_per_block,
                                   bits_per_block * num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);

    for (int i = 0; i < bits_per_block; i++) {

      auto operator_f = [i, final_offset](Torus x) -> Torus {
        Torus y = (x >> i) & 1;
        return y << final_offset;
      };

      generate_device_accumulator<Torus>(
          streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
          lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
          params.polynomial_size, params.message_modulus, params.carry_modulus,
          operator_f, gpu_memory_allocated);
    }

    /**
     * we have bits_per_block LUTs that should be used for all bits in all
     * blocks
     */
    Torus *h_lut_indexes = lut->h_lut_indexes;
    for (int j = 0; j < num_radix_blocks; j++) {
      for (int i = 0; i < bits_per_block; i++)
        h_lut_indexes[i + j * bits_per_block] = i;
    }
    cuda_memcpy_with_size_tracking_async_to_gpu(
        lut->get_lut_indexes(0, 0), h_lut_indexes,
        num_radix_blocks * bits_per_block * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);

    auto active_streams = streams.active_gpu_subset(
        bits_per_block * num_radix_blocks, params.pbs_type);
    lut->broadcast_lut(active_streams);

    /**
     * the input indexes should make the first bits_per_block PBS target
     * block 0, then block 1, etc...
     */
    Torus *h_lwe_indexes_in =
        (Torus *)malloc(num_radix_blocks * bits_per_block * sizeof(Torus));

    for (int j = 0; j < num_radix_blocks; j++) {
      for (int i = 0; i < bits_per_block; i++)
        h_lwe_indexes_in[i + j * bits_per_block] = j;
    }

    /**
     * the outputs should target different lwe ciphertexts, so
     * lwe_indexes_out = range(num_luts)
     */
    Torus *h_lwe_indexes_out =
        (Torus *)malloc(num_radix_blocks * bits_per_block * sizeof(Torus));

    for (int i = 0; i < num_radix_blocks * bits_per_block; i++)
      h_lwe_indexes_out[i] = i;

    lut->set_lwe_indexes(streams.stream(0), streams.gpu_index(0),
                         h_lwe_indexes_in, h_lwe_indexes_out);
    lut->allocate_lwe_vector_for_non_trivial_indexes(
        active_streams, num_radix_blocks * bits_per_block, size_tracker,
        allocate_gpu_memory);

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(h_lwe_indexes_in);
    free(h_lwe_indexes_out);
  }

  // Without offset
  int_bit_extract_luts_buffer(CudaStreams streams, int_radix_params params,
                              uint32_t bits_per_block,
                              uint32_t num_radix_blocks,
                              bool allocate_gpu_memory, uint64_t &size_tracker)
      : int_bit_extract_luts_buffer(streams, params, bits_per_block, 0,
                                    num_radix_blocks, allocate_gpu_memory,
                                    size_tracker) {}

  void release(CudaStreams streams) {
    lut->release(streams);
    delete (lut);
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
};
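
// Index layout sketch for the constructor above: with bits_per_block = 2 and
// num_radix_blocks = 2, the constructor programs
//   lut_indexes     = {0, 1, 0, 1}  (one LUT per extracted bit position)
//   lwe_indexes_in  = {0, 0, 1, 1}  (each input block feeds bits_per_block PBS)
//   lwe_indexes_out = {0, 1, 2, 3}  (one output ciphertext per extracted bit)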

template <typename Torus> struct int_fullprop_buffer {
  int_radix_params params;

  int_radix_lut<Torus> *lut;

  CudaRadixCiphertextFFI *tmp_small_lwe_vector;
  CudaRadixCiphertextFFI *tmp_big_lwe_vector;
  bool gpu_memory_allocated;

  int_fullprop_buffer(CudaStreams streams, int_radix_params params,
                      bool allocate_gpu_memory, uint64_t &size_tracker) {
    this->params = params;
    gpu_memory_allocated = allocate_gpu_memory;
    lut = new int_radix_lut<Torus>(streams.get_ith(0), params, 2, 2,
                                   allocate_gpu_memory, size_tracker);

    // LUTs
    auto lut_f_message = [params](Torus x) -> Torus {
      return x % params.message_modulus;
    };
    auto lut_f_carry = [params](Torus x) -> Torus {
      return x / params.message_modulus;
    };

    Torus *lut_buffer_message = lut->get_lut(0, 0);
    uint64_t *message_degree = lut->get_degree(0);
    uint64_t *message_max_degree = lut->get_max_degree(0);
    Torus *lut_buffer_carry = lut->get_lut(0, 1);
    uint64_t *carry_degree = lut->get_degree(1);
    uint64_t *carry_max_degree = lut->get_max_degree(1);

    generate_device_accumulator<Torus>(
        streams.stream(0), streams.gpu_index(0), lut_buffer_message,
        message_degree, message_max_degree, params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        lut_f_message, gpu_memory_allocated);

    generate_device_accumulator<Torus>(
        streams.stream(0), streams.gpu_index(0), lut_buffer_carry, carry_degree,
        carry_max_degree, params.glwe_dimension, params.polynomial_size,
        params.message_modulus, params.carry_modulus, lut_f_carry,
        gpu_memory_allocated);

    uint64_t lwe_indexes_size = 2 * sizeof(Torus);
    Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
    for (int i = 0; i < 2; i++)
      h_lwe_indexes[i] = i;
    Torus *lwe_indexes = lut->get_lut_indexes(0, 0);
    cuda_memcpy_with_size_tracking_async_to_gpu(
        lwe_indexes, h_lwe_indexes, lwe_indexes_size, streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);

    // No broadcast is needed because full propagation is done on a single GPU.

    tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), tmp_small_lwe_vector, 2,
        params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
    tmp_big_lwe_vector = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), tmp_big_lwe_vector, 2,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
    free(h_lwe_indexes);
  }

  void release(CudaStreams streams) {

    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   tmp_small_lwe_vector, gpu_memory_allocated);
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   tmp_big_lwe_vector, gpu_memory_allocated);
    lut->release(streams.get_ith(0));
    delete tmp_small_lwe_vector;
    delete tmp_big_lwe_vector;
    delete lut;
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
};

template <typename Torus> struct int_sum_ciphertexts_vec_memory {

  int_radix_params params;
  uint32_t max_total_blocks_in_vec;
  uint32_t num_blocks_in_radix;
  uint32_t max_num_radix_in_vec;
  uint32_t chunk_size;
  bool gpu_memory_allocated;
  bool reduce_degrees_for_single_carry_propagation;

  // temporary buffers
  CudaRadixCiphertextFFI *current_blocks;
  CudaRadixCiphertextFFI *small_lwe_vector;

  uint32_t *d_columns_data;
  uint32_t *d_columns_counter;
  uint32_t **d_columns;

  uint32_t *d_new_columns_data;
  uint32_t *d_new_columns_counter;
  uint32_t **d_new_columns;

  uint64_t *d_degrees;

  // lookup table for extracting message and carry
  int_radix_lut<Torus> *luts_message_carry;

  bool mem_reuse = false;
  bool allocated_luts_message_carry;

  void setup_index_buffers(CudaStreams streams, uint64_t &size_tracker) {

    d_degrees = (uint64_t *)cuda_malloc_with_size_tracking_async(
        max_total_blocks_in_vec * sizeof(uint64_t), streams.stream(0),
        streams.gpu_index(0), size_tracker, gpu_memory_allocated);

    auto num_blocks_in_radix = this->num_blocks_in_radix;
    auto max_num_radix_in_vec = this->max_num_radix_in_vec;
    auto setup_columns = [num_blocks_in_radix, max_num_radix_in_vec, streams](
                             uint32_t **&columns, uint32_t *&columns_data,
                             uint32_t *&columns_counter, uint64_t &size_tracker,
                             bool gpu_memory_allocated) {
      columns_data = (uint32_t *)cuda_malloc_with_size_tracking_async(
          num_blocks_in_radix * max_num_radix_in_vec * sizeof(uint32_t),
          streams.stream(0), streams.gpu_index(0), size_tracker,
          gpu_memory_allocated);
      columns_counter = (uint32_t *)cuda_malloc_with_size_tracking_async(
          num_blocks_in_radix * sizeof(uint32_t), streams.stream(0),
          streams.gpu_index(0), size_tracker, gpu_memory_allocated);
      cuda_memset_with_size_tracking_async(
          columns_counter, 0, num_blocks_in_radix * sizeof(uint32_t),
          streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
      uint32_t **h_columns = new uint32_t *[num_blocks_in_radix];
      for (int i = 0; i < num_blocks_in_radix; ++i) {
        h_columns[i] = columns_data + i * max_num_radix_in_vec;
      }
      columns = (uint32_t **)cuda_malloc_with_size_tracking_async(
          num_blocks_in_radix * sizeof(uint32_t *), streams.stream(0),
          streams.gpu_index(0), size_tracker, gpu_memory_allocated);
      if (gpu_memory_allocated) {
        cuda_memcpy_async_to_gpu(columns, h_columns,
                                 num_blocks_in_radix * sizeof(uint32_t *),
                                 streams.stream(0), streams.gpu_index(0));
      }
      cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
      delete[] h_columns;
    };

    setup_columns(d_columns, d_columns_data, d_columns_counter, size_tracker,
                  gpu_memory_allocated);
    setup_columns(d_new_columns, d_new_columns_data, d_new_columns_counter,
                  size_tracker, gpu_memory_allocated);
  }

  void setup_lookup_tables(CudaStreams streams, uint32_t num_radix_in_vec,
                           const uint64_t *const degrees) {
    uint32_t message_modulus = params.message_modulus;
    bool _needs_processing = false;
    radix_columns current_columns(degrees, num_blocks_in_radix,
                                  num_radix_in_vec, chunk_size,
                                  _needs_processing);
    uint32_t total_ciphertexts = 0;
    uint32_t total_messages = 0;
    current_columns.next_accumulation(total_ciphertexts, total_messages,
                                      _needs_processing);
    uint32_t pbs_count = std::max(total_ciphertexts, 2 * num_blocks_in_radix);
    if (!mem_reuse) {
      if (total_ciphertexts > 0 ||
          reduce_degrees_for_single_carry_propagation) {
        uint64_t size_tracker = 0;
        luts_message_carry = new int_radix_lut<Torus>(
            streams, params, 2, pbs_count, true, size_tracker);
        allocated_luts_message_carry = true;
        uint64_t message_modulus_bits =
            (uint64_t)std::log2(params.message_modulus);
        uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
        uint64_t total_bits_per_block =
            message_modulus_bits + carry_modulus_bits;
        uint64_t denominator =
            (uint64_t)std::ceil((pow(2, total_bits_per_block) - 1) /
                                (pow(2, message_modulus_bits) - 1));

        uint64_t upper_bound_num_blocks =
            max_total_blocks_in_vec * 2 / denominator;
        luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
            streams, upper_bound_num_blocks, size_tracker, true);
      }
    }
    if (allocated_luts_message_carry) {
      auto message_acc = luts_message_carry->get_lut(0, 0);
      auto carry_acc = luts_message_carry->get_lut(0, 1);

      // define functions for each accumulator
      auto lut_f_message = [message_modulus](Torus x) -> Torus {
        return x % message_modulus;
      };
      auto lut_f_carry = [message_modulus](Torus x) -> Torus {
        return x / message_modulus;
      };

      // generate accumulators
      generate_device_accumulator<Torus>(
          streams.stream(0), streams.gpu_index(0), message_acc,
          luts_message_carry->get_degree(0),
          luts_message_carry->get_max_degree(0), params.glwe_dimension,
          params.polynomial_size, message_modulus, params.carry_modulus,
          lut_f_message, gpu_memory_allocated);
      generate_device_accumulator<Torus>(
          streams.stream(0), streams.gpu_index(0), carry_acc,
          luts_message_carry->get_degree(1),
          luts_message_carry->get_max_degree(1), params.glwe_dimension,
          params.polynomial_size, message_modulus, params.carry_modulus,
          lut_f_carry, gpu_memory_allocated);
      auto active_gpu_count_mc =
          streams.active_gpu_subset(pbs_count, params.pbs_type);
      luts_message_carry->broadcast_lut(active_gpu_count_mc);
    }
  }
  int_sum_ciphertexts_vec_memory(
      CudaStreams streams, int_radix_params params,
      uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
      bool reduce_degrees_for_single_carry_propagation,
      bool allocate_gpu_memory, uint64_t &size_tracker) {
    this->params = params;
    this->mem_reuse = false;
    this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
    this->num_blocks_in_radix = num_blocks_in_radix;
    this->max_num_radix_in_vec = max_num_radix_in_vec;
    this->gpu_memory_allocated = allocate_gpu_memory;
    this->chunk_size = (params.message_modulus * params.carry_modulus - 1) /
                       (params.message_modulus - 1);
    this->allocated_luts_message_carry = false;
    this->reduce_degrees_for_single_carry_propagation =
        reduce_degrees_for_single_carry_propagation;

    setup_index_buffers(streams, size_tracker);
    // Because the LUT is set up in the sum_ciphertexts host function to save
    // memory, the size_tracker is topped up here to get an upper bound on the
    // memory used.
    uint32_t max_pbs_count = std::max(
        2 * (max_total_blocks_in_vec / chunk_size), 2 * num_blocks_in_radix);
    if (max_pbs_count > 0) {
      int_radix_lut<Torus> *luts_message_carry_dry_run =
          new int_radix_lut<Torus>(streams, params, 2, max_pbs_count, false,
                                   size_tracker);
      luts_message_carry_dry_run->release(streams);
      delete luts_message_carry_dry_run;
    }

    // create and allocate intermediate buffers
    current_blocks = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), current_blocks,
        max_total_blocks_in_vec, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);
    small_lwe_vector = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), small_lwe_vector,
        max_total_blocks_in_vec, params.small_lwe_dimension, size_tracker,
        allocate_gpu_memory);
  }

  int_sum_ciphertexts_vec_memory(
      CudaStreams streams, int_radix_params params,
      uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
      CudaRadixCiphertextFFI *current_blocks,
      CudaRadixCiphertextFFI *small_lwe_vector,
      int_radix_lut<Torus> *reused_lut,
      bool reduce_degrees_for_single_carry_propagation,
      bool allocate_gpu_memory, uint64_t &size_tracker) {
    this->mem_reuse = true;
    this->params = params;
    this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
    this->num_blocks_in_radix = num_blocks_in_radix;
    this->max_num_radix_in_vec = max_num_radix_in_vec;
    this->gpu_memory_allocated = allocate_gpu_memory;
    this->chunk_size = (params.message_modulus * params.carry_modulus - 1) /
                       (params.message_modulus - 1);
    this->allocated_luts_message_carry = true;
    this->reduce_degrees_for_single_carry_propagation =
        reduce_degrees_for_single_carry_propagation;

    this->current_blocks = current_blocks;
    this->small_lwe_vector = small_lwe_vector;
    this->luts_message_carry = reused_lut;

    uint64_t message_modulus_bits = (uint64_t)std::log2(params.message_modulus);
    uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
    uint64_t total_bits_per_block = message_modulus_bits + carry_modulus_bits;
    uint64_t denominator =
        (uint64_t)std::ceil((pow(2, total_bits_per_block) - 1) /
                            (pow(2, message_modulus_bits) - 1));

    uint64_t upper_bound_num_blocks = max_total_blocks_in_vec * 2 / denominator;
    this->luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
        streams, upper_bound_num_blocks, size_tracker, allocate_gpu_memory);
    setup_index_buffers(streams, size_tracker);
  }

  void release(CudaStreams streams) {
    cuda_drop_with_size_tracking_async(d_degrees, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(d_columns_data, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(d_columns_counter, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(d_columns, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);

    cuda_drop_with_size_tracking_async(d_new_columns_data, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(d_new_columns_counter, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(d_new_columns, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);

    if (!mem_reuse) {
      release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                     current_blocks, gpu_memory_allocated);
      release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                     small_lwe_vector, gpu_memory_allocated);
      if (allocated_luts_message_carry) {
        luts_message_carry->release(streams);
        delete luts_message_carry;
      }
      delete current_blocks;
      delete small_lwe_vector;
    }
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
};
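
// Sizing sketch for the staging buffers above (2_2 parameters assumed for the
// arithmetic): with message_modulus = carry_modulus = 4, chunk_size =
// (4 * 4 - 1) / (4 - 1) = 5 and denominator = ceil(15 / 3) = 5, so
// upper_bound_num_blocks = max_total_blocks_in_vec * 2 / 5, i.e. the
// non-trivial-index staging area only has to hold a fraction of the
// worst-case block count.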
|
|
|
|
// For sequential algorithm in group propagation
template <typename Torus> struct int_seq_group_prop_memory {

  CudaRadixCiphertextFFI *group_resolved_carries;
  int_radix_lut<Torus> *lut_sequential_algorithm;
  uint32_t grouping_size;
  bool gpu_memory_allocated;

  int_seq_group_prop_memory(CudaStreams streams, int_radix_params params,
                            uint32_t group_size, uint32_t big_lwe_size_bytes,
                            bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    auto glwe_dimension = params.glwe_dimension;
    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
    auto carry_modulus = params.carry_modulus;

    grouping_size = group_size;
    group_resolved_carries = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), group_resolved_carries,
        grouping_size, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);

    int num_seq_luts = grouping_size - 1;
    Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
    lut_sequential_algorithm =
        new int_radix_lut<Torus>(streams, params, num_seq_luts, num_seq_luts,
                                 allocate_gpu_memory, size_tracker);
    for (int index = 0; index < num_seq_luts; index++) {
      auto f_lut_sequential = [index](Torus propa_cum_sum_block) {
        return (propa_cum_sum_block >> (index + 1)) & 1;
      };
      auto seq_lut = lut_sequential_algorithm->get_lut(0, index);
      generate_device_accumulator<Torus>(
          streams.stream(0), streams.gpu_index(0), seq_lut,
          lut_sequential_algorithm->get_degree(index),
          lut_sequential_algorithm->get_max_degree(index), glwe_dimension,
          polynomial_size, message_modulus, carry_modulus, f_lut_sequential,
          gpu_memory_allocated);
      h_seq_lut_indexes[index] = index;
    }
    Torus *seq_lut_indexes = lut_sequential_algorithm->get_lut_indexes(0, 0);
    cuda_memcpy_with_size_tracking_async_to_gpu(
        seq_lut_indexes, h_seq_lut_indexes, num_seq_luts * sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
    auto active_streams =
        streams.active_gpu_subset(num_seq_luts, params.pbs_type);
    lut_sequential_algorithm->broadcast_lut(active_streams);
    free(h_seq_lut_indexes);
  };
  void release(CudaStreams streams) {
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   group_resolved_carries,
                                   gpu_memory_allocated);
    lut_sequential_algorithm->release(streams);
    delete group_resolved_carries;
    delete lut_sequential_algorithm;
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  };
};

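// Illustrative sketch (hypothetical helper, not used by the GPU code): what
// the sequential group-propagation luts above compute on clear values. The
// lut with id `index` simply extracts bit (index + 1) of the propagation
// cumulative sum, i.e. the carry resolved for the corresponding block of the
// group.
inline uint64_t clear_seq_group_carry(uint64_t propa_cum_sum_block,
                                      int index) {
  // Mirrors f_lut_sequential on clear data
  return (propa_cum_sum_block >> (index + 1)) & 1;
}
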
// For the Hillis-Steele algorithm in group propagation
template <typename Torus> struct int_hs_group_prop_memory {

  int_radix_lut<Torus> *lut_hillis_steele;
  bool gpu_memory_allocated;

  int_hs_group_prop_memory(CudaStreams streams, int_radix_params params,
                           uint32_t num_groups, uint32_t big_lwe_size_bytes,
                           bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    auto glwe_dimension = params.glwe_dimension;
    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
    auto carry_modulus = params.carry_modulus;

    auto f_lut_hillis_steele = [](Torus msb, Torus lsb) -> Torus {
      if (msb == 2) {
        return 1; // Remap Generate to 1
      } else if (msb == 3) {
        // MSB propagates: the result is the (remapped) LSB state
        if (lsb == 2) {
          return 1;
        } else {
          return lsb;
        }
      } else {
        return msb;
      }
    };

    lut_hillis_steele = new int_radix_lut<Torus>(
        streams, params, 1, num_groups, allocate_gpu_memory, size_tracker);

    generate_device_accumulator_bivariate<Torus>(
        streams.stream(0), streams.gpu_index(0),
        lut_hillis_steele->get_lut(0, 0), lut_hillis_steele->get_degree(0),
        lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, f_lut_hillis_steele,
        gpu_memory_allocated);
    auto active_streams =
        streams.active_gpu_subset(num_groups, params.pbs_type);
    lut_hillis_steele->broadcast_lut(active_streams);
  };
  void release(CudaStreams streams) {

    lut_hillis_steele->release(streams);
    delete lut_hillis_steele;
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  }
};

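// Illustrative sketch (hypothetical helper, not used by the GPU code): the
// Hillis-Steele resolution is an inclusive prefix scan over the group states,
// and the bivariate lut above is its combine operator. Per the comments in
// f_lut_hillis_steele, in this lut's encoding 2 marks a group that generates
// a carry and 3 a group that propagates; anything else is treated as an
// already-resolved carry value.
inline uint64_t clear_hillis_steele_combine(uint64_t msb, uint64_t lsb) {
  if (msb == 2)
    return 1; // generate resolves to a carry of 1
  if (msb == 3)
    return (lsb == 2) ? 1 : lsb; // propagate: inherit the lower state
  return msb;                    // already resolved (0 or 1)
}
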
// compute_shifted_blocks_and_block_states
template <typename Torus> struct int_shifted_blocks_and_states_memory {
  CudaRadixCiphertextFFI *shifted_blocks_and_states;
  CudaRadixCiphertextFFI *shifted_blocks;
  CudaRadixCiphertextFFI *block_states;

  int_radix_lut<Torus> *luts_array_first_step;
  bool gpu_memory_allocated;

  int_shifted_blocks_and_states_memory(
      CudaStreams streams, int_radix_params params, uint32_t num_radix_blocks,
      uint32_t num_many_lut, uint32_t grouping_size, bool allocate_gpu_memory,
      uint64_t &size_tracker) {

    gpu_memory_allocated = allocate_gpu_memory;
    auto glwe_dimension = params.glwe_dimension;
    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
    auto carry_modulus = params.carry_modulus;

    shifted_blocks_and_states = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), shifted_blocks_and_states,
        num_many_lut * num_radix_blocks, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);
    shifted_blocks = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), shifted_blocks,
        num_radix_blocks, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);
    block_states = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), block_states, num_radix_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

    uint32_t num_luts_first_step = 2 * grouping_size + 1;

    luts_array_first_step = new int_radix_lut<Torus>(
        streams, params, num_luts_first_step, num_radix_blocks, num_many_lut,
        allocate_gpu_memory, size_tracker);

    auto f_shift_block = [message_modulus](Torus block) -> Torus {
      return (block % message_modulus) << 1;
    };

    auto f_first_block_state = [message_modulus](Torus block) -> Torus {
      if (block >= message_modulus)
        return OUTPUT_CARRY::GENERATED;
      else {
        return OUTPUT_CARRY::NONE;
      }
    };
    std::vector<std::function<Torus(Torus)>> f_first_grouping_luts = {
        f_first_block_state, f_shift_block};

    auto first_block_lut = luts_array_first_step->get_lut(0, 0);
    auto first_block_lut_degrees = luts_array_first_step->get_degree(0);
    auto first_block_lut_max_degree = luts_array_first_step->get_max_degree(0);
    generate_many_lut_device_accumulator<Torus>(
        streams.stream(0), streams.gpu_index(0), first_block_lut,
        first_block_lut_degrees, first_block_lut_max_degree, glwe_dimension,
        polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts,
        gpu_memory_allocated);

    // luts for the other blocks of the first grouping
    for (int lut_id = 1; lut_id < grouping_size; lut_id++) {
      auto f_state = [message_modulus, lut_id](Torus block) -> Torus {
        uint64_t r = 0;
        if (block >= message_modulus) {
          r = 2; // Generates a carry
        } else if (block == (message_modulus - 1)) {
          r = 1; // Propagates a carry
        } else {
          r = 0; // Does not generate a carry
        }
        return r << (lut_id - 1);
      };
      std::vector<std::function<Torus(Torus)>> f_grouping_luts = {
          f_state, f_shift_block};
      auto lut = luts_array_first_step->get_lut(0, lut_id);
      auto lut_degrees = luts_array_first_step->get_degree(lut_id);
      auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id);
      generate_many_lut_device_accumulator<Torus>(
          streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
          lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
          carry_modulus, f_grouping_luts, gpu_memory_allocated);
    }

    // luts for the rest of the groupings (except for the last block)
    for (int i = 0; i < grouping_size; i++) {
      uint32_t lut_id = i + grouping_size;
      auto f_state = [message_modulus, i](Torus block) -> Torus {
        uint64_t r = 0;
        if (block >= message_modulus) {
          r = 2; // Generates a carry
        } else if (block == (message_modulus - 1)) {
          r = 1; // Propagates a carry
        } else {
          r = 0; // Does not generate a carry
        }
        return r << i;
      };
      std::vector<std::function<Torus(Torus)>> f_grouping_luts = {
          f_state, f_shift_block};

      auto lut = luts_array_first_step->get_lut(0, lut_id);
      auto lut_degrees = luts_array_first_step->get_degree(lut_id);
      auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id);
      generate_many_lut_device_accumulator<Torus>(
          streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
          lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
          carry_modulus, f_grouping_luts, gpu_memory_allocated);
    }

    // For the last block we need to generate a new lut
    auto f_last_block_state = [message_modulus](Torus block) -> Torus {
      if (block >= message_modulus)
        return 2 << 1; // Generates a carry
      else
        return 0; // Nothing
    };

    uint32_t lut_id = num_luts_first_step - 1; // The last lut of the first step

    auto last_block_lut = luts_array_first_step->get_lut(0, lut_id);
    auto last_block_lut_degrees = luts_array_first_step->get_degree(lut_id);
    auto last_block_lut_max_degree =
        luts_array_first_step->get_max_degree(lut_id);

    std::vector<std::function<Torus(Torus)>> f_last_grouping_luts = {
        f_last_block_state, f_shift_block};

    generate_many_lut_device_accumulator<Torus>(
        streams.stream(0), streams.gpu_index(0), last_block_lut,
        last_block_lut_degrees, last_block_lut_max_degree, glwe_dimension,
        polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts,
        gpu_memory_allocated);

    // Generate the indexes used to switch between luts within the pbs
    uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);

    Torus *h_lut_indexes = luts_array_first_step->h_lut_indexes;
    for (int index = 0; index < num_radix_blocks; index++) {
      uint32_t grouping_index = index / grouping_size;
      bool is_in_first_grouping = (grouping_index == 0);
      uint32_t index_in_grouping = index % grouping_size;
      bool is_last_index = (index == (num_radix_blocks - 1));
      if (is_last_index) {
        if (num_radix_blocks == 1) {
          h_lut_indexes[index] = 2 * grouping_size;
        } else {
          h_lut_indexes[index] = 2;
        }
      } else if (is_in_first_grouping) {
        h_lut_indexes[index] = index_in_grouping;
      } else {
        h_lut_indexes[index] = index_in_grouping + grouping_size;
      }
    }

    // copy the indexes to the gpu
    Torus *lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
    cuda_memcpy_with_size_tracking_async_to_gpu(
        lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
    // Do I need to do something else for the multi-gpu?
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
    luts_array_first_step->broadcast_lut(active_streams);
  };
  void release(CudaStreams streams) {

    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   shifted_blocks_and_states,
                                   gpu_memory_allocated);
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   shifted_blocks, gpu_memory_allocated);
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   block_states, gpu_memory_allocated);

    luts_array_first_step->release(streams);
    delete luts_array_first_step;
    delete shifted_blocks_and_states;
    delete shifted_blocks;
    delete block_states;
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  };
};

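// Illustrative sketch (hypothetical helper, not used by the GPU code): the
// first-step lut index chosen for a given block, mirroring the host loop that
// fills h_lut_indexes above. Blocks of the first grouping use luts
// [0, grouping_size), blocks of the other groupings use
// [grouping_size, 2 * grouping_size), and the last block uses its dedicated
// lut (index 2 * grouping_size when there is a single block, index 2
// otherwise).
inline uint32_t clear_first_step_lut_index(uint32_t index,
                                           uint32_t num_radix_blocks,
                                           uint32_t grouping_size) {
  bool is_last_index = (index == num_radix_blocks - 1);
  if (is_last_index)
    return (num_radix_blocks == 1) ? 2 * grouping_size : 2;
  uint32_t index_in_grouping = index % grouping_size;
  bool is_in_first_grouping = (index / grouping_size) == 0;
  return is_in_first_grouping ? index_in_grouping
                              : index_in_grouping + grouping_size;
}
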
// compute_propagation simulator and group carries
template <typename Torus> struct int_prop_simu_group_carries_memory {
  CudaRadixCiphertextFFI *propagation_cum_sums;
  CudaRadixCiphertextFFI *simulators;
  CudaRadixCiphertextFFI *prepared_blocks;
  CudaRadixCiphertextFFI *grouping_pgns;
  CudaRadixCiphertextFFI *resolved_carries;

  Torus *scalar_array_cum_sum;
  Torus *h_scalar_array_cum_sum;

  int_radix_lut<Torus> *luts_array_second_step;

  int_seq_group_prop_memory<Torus> *seq_group_prop_mem;
  int_hs_group_prop_memory<Torus> *hs_group_prop_mem;

  uint32_t group_size;
  bool use_sequential_algorithm_to_resolve_group_carries;
  bool gpu_memory_allocated;

  int_prop_simu_group_carries_memory(
      CudaStreams streams, int_radix_params params, uint32_t num_radix_blocks,
      uint32_t grouping_size, uint32_t num_groups, bool allocate_gpu_memory,
      uint64_t &size_tracker) {

    gpu_memory_allocated = allocate_gpu_memory;
    auto glwe_dimension = params.glwe_dimension;
    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
    auto carry_modulus = params.carry_modulus;
    auto big_lwe_size = (polynomial_size * glwe_dimension + 1);
    auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

    uint32_t block_modulus = message_modulus * carry_modulus;
    uint32_t num_bits_in_block = std::log2(block_modulus);

    group_size = grouping_size;

    propagation_cum_sums = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), propagation_cum_sums,
        num_radix_blocks, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);
    simulators = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), simulators, num_radix_blocks,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
    prepared_blocks = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), prepared_blocks,
        num_radix_blocks + 1, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);
    resolved_carries = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), resolved_carries,
        num_groups + 1, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);
    grouping_pgns = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), grouping_pgns, num_groups,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
    scalar_array_cum_sum = (Torus *)cuda_malloc_with_size_tracking_async(
        num_radix_blocks * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), size_tracker, allocate_gpu_memory);
    cuda_memset_with_size_tracking_async(
        scalar_array_cum_sum, 0, num_radix_blocks * sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
    h_scalar_array_cum_sum = new Torus[num_radix_blocks]();

    // create lut objects for step 2
    uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
    uint32_t num_carry_to_resolve = num_groups - 1;
    uint32_t saturated_sub =
        ((num_carry_to_resolve > 1) ? num_carry_to_resolve - 1 : 0);
    uint32_t sequential_depth = saturated_sub / (grouping_size - 1);
    uint32_t hillis_steele_depth;

    if (num_carry_to_resolve == 0) {
      hillis_steele_depth = 0;
    } else {
      hillis_steele_depth = std::ceil(std::log2(num_carry_to_resolve));
    }

    use_sequential_algorithm_to_resolve_group_carries =
        sequential_depth <= hillis_steele_depth;
    uint32_t num_extra_luts = 0;
    if (use_sequential_algorithm_to_resolve_group_carries) {
      num_extra_luts = (grouping_size - 1);
    } else {
      num_extra_luts = 1;
    }

    uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
    luts_array_second_step = new int_radix_lut<Torus>(
        streams, params, num_luts_second_step, num_radix_blocks,
        allocate_gpu_memory, size_tracker);

    // luts for first group inner propagation
    for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
      auto f_first_grouping_inner_propagation =
          [lut_id](Torus propa_cum_sum_block) -> Torus {
        uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;

        if (carry != 0) {
          return 2ull; // Generates a carry
        } else {
          return 0ull; // Does not generate a carry
        }
      };

      generate_device_accumulator<Torus>(
          streams.stream(0), streams.gpu_index(0),
          luts_array_second_step->get_lut(0, lut_id),
          luts_array_second_step->get_degree(lut_id),
          luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
          polynomial_size, message_modulus, carry_modulus,
          f_first_grouping_inner_propagation, gpu_memory_allocated);
    }

    auto f_first_grouping_outer_propagation =
        [num_bits_in_block](Torus block) -> Torus {
      return (block >> (num_bits_in_block - 1)) & 1;
    };

    int lut_id = grouping_size - 1;
    generate_device_accumulator<Torus>(
        streams.stream(0), streams.gpu_index(0),
        luts_array_second_step->get_lut(0, lut_id),
        luts_array_second_step->get_degree(lut_id),
        luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
        polynomial_size, message_modulus, carry_modulus,
        f_first_grouping_outer_propagation, gpu_memory_allocated);

    // for the other groupings' inner propagation
    for (int index = 0; index < grouping_size; index++) {
      uint32_t lut_id = index + grouping_size;

      auto f_other_groupings_inner_propagation =
          [index](Torus propa_cum_sum_block) -> Torus {
        uint64_t mask = (2 << index) - 1;
        if (propa_cum_sum_block >= (2 << index)) {
          return 2ull; // Generates
        } else if ((propa_cum_sum_block & mask) == mask) {
          return 1ull; // Propagates
        } else {
          return 0ull; // Nothing
        }
      };

      generate_device_accumulator<Torus>(
          streams.stream(0), streams.gpu_index(0),
          luts_array_second_step->get_lut(0, lut_id),
          luts_array_second_step->get_degree(lut_id),
          luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
          polynomial_size, message_modulus, carry_modulus,
          f_other_groupings_inner_propagation, gpu_memory_allocated);
    }

    if (use_sequential_algorithm_to_resolve_group_carries) {
      for (int index = 0; index < grouping_size - 1; index++) {
        uint32_t lut_id = index + 2 * grouping_size;

        auto f_group_propagation = [index, block_modulus,
                                    num_bits_in_block](Torus block) -> Torus {
          if (block == (block_modulus - 1)) {
            return 0ull;
          } else {
            return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
          }
        };

        generate_device_accumulator<Torus>(
            streams.stream(0), streams.gpu_index(0),
            luts_array_second_step->get_lut(0, lut_id),
            luts_array_second_step->get_degree(lut_id),
            luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
            polynomial_size, message_modulus, carry_modulus,
            f_group_propagation, gpu_memory_allocated);
      }
    } else {
      uint32_t lut_id = 2 * grouping_size;
      auto f_group_propagation = [block_modulus](Torus block) {
        if (block == (block_modulus - 1)) {
          return 2ull;
        } else {
          return UINT64_MAX % (block_modulus * 2ull);
        }
      };

      generate_device_accumulator<Torus>(
          streams.stream(0), streams.gpu_index(0),
          luts_array_second_step->get_lut(0, lut_id),
          luts_array_second_step->get_degree(lut_id),
          luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
          polynomial_size, message_modulus, carry_modulus, f_group_propagation,
          gpu_memory_allocated);
    }

    Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size);

    for (int index = 0; index < num_radix_blocks; index++) {
      uint32_t grouping_index = index / grouping_size;
      bool is_in_first_grouping = (grouping_index == 0);
      uint32_t index_in_grouping = index % grouping_size;

      if (is_in_first_grouping) {
        h_second_lut_indexes[index] = index_in_grouping;
      } else if (index_in_grouping == (grouping_size - 1)) {
        if (use_sequential_algorithm_to_resolve_group_carries) {
          int inner_index = (grouping_index - 1) % (grouping_size - 1);
          h_second_lut_indexes[index] = inner_index + 2 * grouping_size;
        } else {
          h_second_lut_indexes[index] = 2 * grouping_size;
        }
      } else {
        h_second_lut_indexes[index] = index_in_grouping + grouping_size;
      }

      bool may_have_its_padding_bit_set =
          !is_in_first_grouping && (index_in_grouping == grouping_size - 1);

      if (may_have_its_padding_bit_set) {
        if (use_sequential_algorithm_to_resolve_group_carries) {
          h_scalar_array_cum_sum[index] =
              1 << ((grouping_index - 1) % (grouping_size - 1));
        } else {
          h_scalar_array_cum_sum[index] = 1;
        }
      } else {
        h_scalar_array_cum_sum[index] = 0;
      }
    }

    // copy the indexes to the gpu
    Torus *second_lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
    cuda_memcpy_with_size_tracking_async_to_gpu(
        second_lut_indexes, h_second_lut_indexes, lut_indexes_size,
        streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);

    cuda_memcpy_with_size_tracking_async_to_gpu(
        scalar_array_cum_sum, h_scalar_array_cum_sum,
        num_radix_blocks * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
    luts_array_second_step->broadcast_lut(active_streams);

    if (use_sequential_algorithm_to_resolve_group_carries) {

      seq_group_prop_mem = new int_seq_group_prop_memory<Torus>(
          streams, params, grouping_size, big_lwe_size_bytes,
          allocate_gpu_memory, size_tracker);

    } else {
      hs_group_prop_mem = new int_hs_group_prop_memory<Torus>(
          streams, params, num_groups, big_lwe_size_bytes, allocate_gpu_memory,
          size_tracker);
    }

    free(h_second_lut_indexes);
  };

  // needed for the division to update the lut indexes
  void update_lut_indexes(CudaStreams streams, Torus *new_lut_indexes,
                          Torus *new_scalars, uint32_t new_num_blocks) {
    Torus *lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
    cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
        lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
    auto new_active_streams = streams.active_gpu_subset(
        new_num_blocks, luts_array_second_step->params.pbs_type);
    // We only need to update the lut indexes, so we pass false here
    luts_array_second_step->broadcast_lut(new_active_streams, false);

    cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
        scalar_array_cum_sum, new_scalars, new_num_blocks * sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
  }

  void release(CudaStreams streams) {
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   propagation_cum_sums, gpu_memory_allocated);
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   simulators, gpu_memory_allocated);
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   grouping_pgns, gpu_memory_allocated);
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   prepared_blocks, gpu_memory_allocated);
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   resolved_carries, gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(scalar_array_cum_sum, streams.stream(0),
                                       streams.gpu_index(0),
                                       gpu_memory_allocated);
    luts_array_second_step->release(streams);

    if (use_sequential_algorithm_to_resolve_group_carries) {
      seq_group_prop_mem->release(streams);
      delete seq_group_prop_mem;
    } else {
      hs_group_prop_mem->release(streams);
      delete hs_group_prop_mem;
    }

    delete propagation_cum_sums;
    delete simulators;
    delete grouping_pgns;
    delete prepared_blocks;
    delete resolved_carries;
    delete luts_array_second_step;
    delete[] h_scalar_array_cum_sum;
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  };
};

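// Illustrative sketch (hypothetical helper, not used by the GPU code): how
// the constructor above decides between the sequential and the Hillis-Steele
// resolution of group carries. The sequential path resolves roughly
// (grouping_size - 1) carries per level while Hillis-Steele needs
// ceil(log2(n)) levels; the cheaper of the two (in PBS depth) is selected.
inline bool
clear_use_sequential_group_carry_resolution(uint32_t num_groups,
                                            uint32_t grouping_size) {
  uint32_t num_carry_to_resolve = num_groups - 1;
  uint32_t saturated_sub =
      (num_carry_to_resolve > 1) ? num_carry_to_resolve - 1 : 0;
  uint32_t sequential_depth = saturated_sub / (grouping_size - 1);
  uint32_t hillis_steele_depth =
      (num_carry_to_resolve == 0)
          ? 0
          : (uint32_t)std::ceil(std::log2((double)num_carry_to_resolve));
  return sequential_depth <= hillis_steele_depth;
}
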
template <typename Torus> struct int_sc_prop_memory {
  uint32_t num_many_lut;
  uint32_t lut_stride;

  uint32_t num_groups;
  CudaRadixCiphertextFFI *output_flag;
  CudaRadixCiphertextFFI *last_lhs;
  CudaRadixCiphertextFFI *last_rhs;
  int_radix_lut<Torus> *lut_message_extract;

  int_radix_lut<Torus> *lut_overflow_flag_prep;

  int_shifted_blocks_and_states_memory<Torus> *shifted_blocks_state_mem;
  int_prop_simu_group_carries_memory<Torus> *prop_simu_group_carries_mem;

  int_radix_params params;
  uint32_t requested_flag;
  bool gpu_memory_allocated;

  int_sc_prop_memory(CudaStreams streams, int_radix_params params,
                     uint32_t num_radix_blocks, uint32_t requested_flag_in,
                     bool allocate_gpu_memory, uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->params = params;
    auto glwe_dimension = params.glwe_dimension;
    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
    auto carry_modulus = params.carry_modulus;
    requested_flag = requested_flag_in;
    // for compute shifted blocks and block states
    uint32_t block_modulus = message_modulus * carry_modulus;
    uint32_t num_bits_in_block = std::log2(block_modulus);
    uint32_t grouping_size = num_bits_in_block;
    num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;

    num_many_lut = 2; // the many-lut PBS applies 2 luts at once
    uint32_t box_size = polynomial_size / block_modulus;
    lut_stride = (block_modulus / num_many_lut) * box_size;

    shifted_blocks_state_mem = new int_shifted_blocks_and_states_memory<Torus>(
        streams, params, num_radix_blocks, num_many_lut, grouping_size,
        allocate_gpu_memory, size_tracker);

    prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory<Torus>(
        streams, params, num_radix_blocks, grouping_size, num_groups,
        allocate_gpu_memory, size_tracker);

    // Step 3 elements
    int num_luts_message_extract =
        requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
    lut_message_extract = new int_radix_lut<Torus>(
        streams, params, num_luts_message_extract, num_radix_blocks + 1,
        allocate_gpu_memory, size_tracker);
    // lut for the first block in the first grouping
    auto f_message_extract = [message_modulus](Torus block) -> Torus {
      return (block >> 1) % message_modulus;
    };

    generate_device_accumulator<Torus>(
        streams.stream(0), streams.gpu_index(0),
        lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
        lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, f_message_extract,
        gpu_memory_allocated);

    // This stores a single extra block that will hold the overflow or carry
    // result
    output_flag = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), output_flag,
        num_radix_blocks + 1, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);

    if (requested_flag == outputFlag::FLAG_OVERFLOW) {
      last_lhs = new CudaRadixCiphertextFFI;
      last_rhs = new CudaRadixCiphertextFFI;
      create_zero_radix_ciphertext_async<Torus>(
          streams.stream(0), streams.gpu_index(0), last_lhs, 1,
          params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
      create_zero_radix_ciphertext_async<Torus>(
          streams.stream(0), streams.gpu_index(0), last_rhs, 1,
          params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

      // For step 1, the overflow lut is only prepared when the overflow flag
      // is requested
      uint32_t num_bits_in_message = std::log2(message_modulus);
      lut_overflow_flag_prep = new int_radix_lut<Torus>(
          streams, params, 1, 1, allocate_gpu_memory, size_tracker);

      auto f_overflow_fp = [num_bits_in_message](Torus lhs,
                                                 Torus rhs) -> Torus {
        Torus mask = (1 << (num_bits_in_message - 1)) - 1;
        Torus lhs_except_last_bit = lhs & mask;
        Torus rhs_except_last_bit = rhs & mask;
        Torus input_carry1 = 1;
        Torus input_carry2 = 0;

        Torus output_carry1 =
            ((lhs + rhs + input_carry1) >> num_bits_in_message) & 1;
        Torus output_carry2 =
            ((lhs + rhs + input_carry2) >> num_bits_in_message) & 1;
        Torus input_carry_last_bit1 =
            ((lhs_except_last_bit + rhs_except_last_bit + input_carry1) >>
             (num_bits_in_message - 1)) &
            1;
        Torus input_carry_last_bit2 =
            ((lhs_except_last_bit + rhs_except_last_bit + input_carry2) >>
             (num_bits_in_message - 1)) &
            1;

        Torus output1 = (Torus)(input_carry_last_bit1 != output_carry1);
        Torus output2 = (Torus)(input_carry_last_bit2 != output_carry2);

        return output1 << 3 | output2 << 2;
      };

      generate_device_accumulator_bivariate<Torus>(
          streams.stream(0), streams.gpu_index(0),
          lut_overflow_flag_prep->get_lut(0, 0),
          lut_overflow_flag_prep->get_degree(0),
          lut_overflow_flag_prep->get_max_degree(0), glwe_dimension,
          polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
          gpu_memory_allocated);

      auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
      lut_overflow_flag_prep->broadcast_lut(active_streams);
    }

    // Final cleanup in case of overflow or carry. This lut could probably be
    // applied together with the previous one, but we keep them separate for
    // now.
    if (requested_flag == outputFlag::FLAG_OVERFLOW) { // Overflow case
      auto f_overflow_last = [num_radix_blocks,
                              requested_flag_in](Torus block) -> Torus {
        uint32_t position = (num_radix_blocks == 1 &&
                             requested_flag_in == outputFlag::FLAG_OVERFLOW)
                                ? 0
                                : 1;
        Torus input_carry = (block >> position) & 1;
        Torus does_overflow_if_carry_is_1 = (block >> 3) & 1;
        Torus does_overflow_if_carry_is_0 = (block >> 2) & 1;
        if (input_carry == outputFlag::FLAG_OVERFLOW) {
          return does_overflow_if_carry_is_1;
        } else {
          return does_overflow_if_carry_is_0;
        }
      };

      generate_device_accumulator<Torus>(
          streams.stream(0), streams.gpu_index(0),
          lut_message_extract->get_lut(0, 1),
          lut_message_extract->get_degree(1),
          lut_message_extract->get_max_degree(1), glwe_dimension,
          polynomial_size, message_modulus, carry_modulus, f_overflow_last,
          gpu_memory_allocated);

      Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
      for (int index = 0; index < num_radix_blocks + 1; index++) {
        if (index < num_radix_blocks) {
          h_lut_indexes[index] = 0;
        } else {
          h_lut_indexes[index] = 1;
        }
      }
      cuda_memcpy_with_size_tracking_async_to_gpu(
          lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
          (num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
          streams.gpu_index(0), allocate_gpu_memory);
    }
    if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case

      auto f_carry_last = [](Torus block) -> Torus {
        return ((block >> 2) & 1);
      };

      generate_device_accumulator<Torus>(
          streams.stream(0), streams.gpu_index(0),
          lut_message_extract->get_lut(0, 1),
          lut_message_extract->get_degree(1),
          lut_message_extract->get_max_degree(1), glwe_dimension,
          polynomial_size, message_modulus, carry_modulus, f_carry_last,
          gpu_memory_allocated);

      Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
      for (int index = 0; index < num_radix_blocks + 1; index++) {
        if (index < num_radix_blocks) {
          h_lut_indexes[index] = 0;
        } else {
          h_lut_indexes[index] = 1;
        }
      }
      cuda_memcpy_with_size_tracking_async_to_gpu(
          lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
          (num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
          streams.gpu_index(0), allocate_gpu_memory);
    }
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
    lut_message_extract->broadcast_lut(active_streams);
  };

  void release(CudaStreams streams) {

    shifted_blocks_state_mem->release(streams);
    prop_simu_group_carries_mem->release(streams);
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   output_flag, gpu_memory_allocated);
    lut_message_extract->release(streams);
    delete shifted_blocks_state_mem;
    delete prop_simu_group_carries_mem;
    delete output_flag;
    delete lut_message_extract;

    if (requested_flag == outputFlag::FLAG_OVERFLOW) { // In case of overflow
      release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                     last_lhs, gpu_memory_allocated);
      release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                     last_rhs, gpu_memory_allocated);
      lut_overflow_flag_prep->release(streams);
      delete lut_overflow_flag_prep;
      delete last_lhs;
      delete last_rhs;
    }
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  };
};

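// Illustrative sketch (hypothetical helper, not used by the GPU code): the
// clear-value counterpart of f_overflow_fp above. For the most significant
// blocks of the two operands, signed overflow happens iff the carry into the
// sign bit differs from the carry out of it; since the true input carry is
// unknown at this point, both cases are precomputed and packed into bit 3
// (input carry = 1) and bit 2 (input carry = 0) of the result.
inline uint64_t clear_overflow_flag_prep(uint64_t lhs, uint64_t rhs,
                                         uint32_t num_bits_in_message) {
  uint64_t mask = (1ull << (num_bits_in_message - 1)) - 1;
  uint64_t out = 0;
  for (uint64_t input_carry = 0; input_carry <= 1; input_carry++) {
    uint64_t carry_out =
        ((lhs + rhs + input_carry) >> num_bits_in_message) & 1;
    uint64_t carry_into_sign =
        (((lhs & mask) + (rhs & mask) + input_carry) >>
         (num_bits_in_message - 1)) &
        1;
    uint64_t overflows = (carry_into_sign != carry_out) ? 1 : 0;
    out |= overflows << (input_carry ? 3 : 2);
  }
  return out;
}
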
template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
  CudaRadixCiphertextFFI *shifted_blocks_and_borrow_states;
  CudaRadixCiphertextFFI *shifted_blocks;
  CudaRadixCiphertextFFI *borrow_states;

  int_radix_lut<Torus> *luts_array_first_step;
  bool gpu_memory_allocated;

  int_shifted_blocks_and_borrow_states_memory(
      CudaStreams streams, int_radix_params params, uint32_t num_radix_blocks,
      uint32_t num_many_lut, uint32_t grouping_size, bool allocate_gpu_memory,
      uint64_t &size_tracker) {

    gpu_memory_allocated = allocate_gpu_memory;
    auto glwe_dimension = params.glwe_dimension;
    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
    auto carry_modulus = params.carry_modulus;

    shifted_blocks_and_borrow_states = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0),
        shifted_blocks_and_borrow_states, num_radix_blocks * num_many_lut,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
    shifted_blocks = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), shifted_blocks,
        num_radix_blocks, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);
    borrow_states = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), borrow_states,
        num_radix_blocks, params.big_lwe_dimension, size_tracker,
        allocate_gpu_memory);

    uint32_t num_luts_first_step = 2 * grouping_size + 1;

    luts_array_first_step = new int_radix_lut<Torus>(
        streams, params, num_luts_first_step, num_radix_blocks, num_many_lut,
        allocate_gpu_memory, size_tracker);

    auto f_shift_block = [message_modulus](Torus block) -> Torus {
      uint64_t overflow_guard = message_modulus;
      uint64_t block_mod = block % message_modulus;
      return (overflow_guard | block_mod) << 1;
    };

    auto f_first_block_state = [message_modulus](Torus block) -> Torus {
      if (block < message_modulus)
        return 1; // Borrows
      else {
        return 0; // Nothing
      }
    };
    std::vector<std::function<Torus(Torus)>> f_first_grouping_luts = {
        f_first_block_state, f_shift_block};

    auto first_block_lut = luts_array_first_step->get_lut(0, 0);
    auto first_block_lut_degrees = luts_array_first_step->get_degree(0);
    auto first_block_lut_max_degree = luts_array_first_step->get_max_degree(0);

    generate_many_lut_device_accumulator<Torus>(
        streams.stream(0), streams.gpu_index(0), first_block_lut,
        first_block_lut_degrees, first_block_lut_max_degree, glwe_dimension,
        polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts,
        gpu_memory_allocated);

    // luts for the other blocks of the first grouping
    for (int lut_id = 1; lut_id < grouping_size; lut_id++) {
      auto f_state = [message_modulus, lut_id](Torus block) -> Torus {
        uint64_t r = 0;
        if (block < message_modulus) {
          r = 2; // Generates a borrow
        } else if (block == message_modulus) {
          r = 1; // Propagates a borrow
        } else {
          r = 0; // Does not borrow
        }
        return r << (lut_id - 1);
      };
      std::vector<std::function<Torus(Torus)>> f_grouping_luts = {
          f_state, f_shift_block};
      auto lut = luts_array_first_step->get_lut(0, lut_id);
      auto lut_degrees = luts_array_first_step->get_degree(lut_id);
      auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id);
      generate_many_lut_device_accumulator<Torus>(
          streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
          lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
          carry_modulus, f_grouping_luts, gpu_memory_allocated);
    }

    // luts for the rest of the groupings (except for the last block)
    for (int i = 0; i < grouping_size; i++) {
      uint32_t lut_id = i + grouping_size;
      auto f_state = [message_modulus, i](Torus block) -> Torus {
        uint64_t r = 0;
        if (block < message_modulus) {
          r = 2; // Generates a borrow
        } else if (block == message_modulus) {
          r = 1; // Propagates a borrow
        } else {
          r = 0; // Does not borrow
        }
        return r << i;
      };
      std::vector<std::function<Torus(Torus)>> f_grouping_luts = {
          f_state, f_shift_block};

      auto lut = luts_array_first_step->get_lut(0, lut_id);
      auto lut_degrees = luts_array_first_step->get_degree(lut_id);
      auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id);
      generate_many_lut_device_accumulator<Torus>(
          streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
          lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
          carry_modulus, f_grouping_luts, gpu_memory_allocated);
    }

    auto f_last_block_state = [message_modulus](Torus block) -> Torus {
      if (block < message_modulus)
        return 2 << 1; // Generates a borrow
      else
        return 0; // Nothing
    };

    uint32_t lut_id = num_luts_first_step - 1; // The last lut of the first step

    auto last_block_lut = luts_array_first_step->get_lut(0, lut_id);
    auto last_block_lut_degrees = luts_array_first_step->get_degree(lut_id);
    auto last_block_lut_max_degree =
        luts_array_first_step->get_max_degree(lut_id);

    std::vector<std::function<Torus(Torus)>> f_last_grouping_luts = {
        f_last_block_state, f_shift_block};

    generate_many_lut_device_accumulator<Torus>(
        streams.stream(0), streams.gpu_index(0), last_block_lut,
        last_block_lut_degrees, last_block_lut_max_degree, glwe_dimension,
        polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts,
        gpu_memory_allocated);

    // Generate the indexes used to switch between luts within the pbs
    uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
    Torus *h_lut_indexes = luts_array_first_step->h_lut_indexes;

    for (int index = 0; index < num_radix_blocks; index++) {
      uint32_t grouping_index = index / grouping_size;
      bool is_in_first_grouping = (grouping_index == 0);
      uint32_t index_in_grouping = index % grouping_size;
      bool is_last_index = (index == (num_radix_blocks - 1));
      if (is_last_index) {
        if (num_radix_blocks == 1) {
          h_lut_indexes[index] = 2 * grouping_size;
        } else {
          h_lut_indexes[index] = 2;
        }
      } else if (is_in_first_grouping) {
        h_lut_indexes[index] = index_in_grouping;
      } else {
        h_lut_indexes[index] = index_in_grouping + grouping_size;
      }
    }
    // copy the indexes to the gpu
    Torus *lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
    cuda_memcpy_with_size_tracking_async_to_gpu(
        lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
    // Do I need to do something else for the multi-gpu?
    auto active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
    luts_array_first_step->broadcast_lut(active_streams);
  };

  // needed for the division to update the lut indexes
  void update_lut_indexes(CudaStreams streams, Torus *new_lut_indexes,
                          uint32_t new_num_blocks) {
    Torus *lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
    cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
        lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
    auto new_active_streams = streams.active_gpu_subset(
        new_num_blocks, luts_array_first_step->params.pbs_type);
    // We only need to update the lut indexes, so we pass false here
    luts_array_first_step->broadcast_lut(new_active_streams, false);
  }
  void release(CudaStreams streams) {

    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   shifted_blocks_and_borrow_states,
                                   gpu_memory_allocated);
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   shifted_blocks, gpu_memory_allocated);
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   borrow_states, gpu_memory_allocated);

    luts_array_first_step->release(streams);
    delete luts_array_first_step;
    delete shifted_blocks_and_borrow_states;
    delete shifted_blocks;
    delete borrow_states;
    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  };
};

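// Illustrative sketch (hypothetical helper, not used by the GPU code): the
// clear-value borrow state computed by the first-step luts above for a block
// of the subtraction, already shifted to its position inside the grouping.
// A block strictly below message_modulus generates a borrow, a block equal to
// message_modulus propagates an incoming borrow, and anything larger absorbs
// it.
inline uint64_t clear_borrow_state(uint64_t block, uint64_t message_modulus,
                                   uint32_t position_in_grouping) {
  uint64_t r;
  if (block < message_modulus)
    r = 2; // generates a borrow
  else if (block == message_modulus)
    r = 1; // propagates a borrow
  else
    r = 0; // does not borrow
  return r << position_in_grouping;
}
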
template <typename Torus> struct int_borrow_prop_memory {
  uint32_t num_many_lut;
  uint32_t lut_stride;

  uint32_t group_size;
  uint32_t num_groups;
  CudaRadixCiphertextFFI *overflow_block;

  int_radix_lut<Torus> *lut_message_extract;
  int_radix_lut<Torus> *lut_borrow_flag;

  int_shifted_blocks_and_borrow_states_memory<Torus>
      *shifted_blocks_borrow_state_mem;
  int_prop_simu_group_carries_memory<Torus> *prop_simu_group_carries_mem;

  int_radix_params params;

  CudaStreams active_streams;
  InternalCudaStreams internal_streams;

  uint32_t compute_overflow;
  bool gpu_memory_allocated;
  int_borrow_prop_memory(CudaStreams streams, int_radix_params params,
                         uint32_t num_radix_blocks,
                         uint32_t compute_overflow_in, bool allocate_gpu_memory,
                         uint64_t &size_tracker) {
    gpu_memory_allocated = allocate_gpu_memory;
    this->params = params;
    auto glwe_dimension = params.glwe_dimension;
    auto polynomial_size = params.polynomial_size;
    auto message_modulus = params.message_modulus;
    auto carry_modulus = params.carry_modulus;
    compute_overflow = compute_overflow_in;
    // for compute shifted blocks and block states
    uint32_t block_modulus = message_modulus * carry_modulus;
    uint32_t num_bits_in_block = std::log2(block_modulus);
    uint32_t grouping_size = num_bits_in_block;
    group_size = grouping_size;
    num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;

    num_many_lut = 2; // the many-lut PBS applies 2 luts at once
    uint32_t box_size = polynomial_size / block_modulus;
    lut_stride = (block_modulus / num_many_lut) * box_size;

    shifted_blocks_borrow_state_mem =
        new int_shifted_blocks_and_borrow_states_memory<Torus>(
            streams, params, num_radix_blocks, num_many_lut, grouping_size,
            allocate_gpu_memory, size_tracker);

    prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory<Torus>(
        streams, params, num_radix_blocks, grouping_size, num_groups,
        allocate_gpu_memory, size_tracker);

    overflow_block = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<Torus>(
        streams.stream(0), streams.gpu_index(0), overflow_block, 1,
        params.big_lwe_dimension, size_tracker, allocate_gpu_memory);

    lut_message_extract =
        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                 allocate_gpu_memory, size_tracker);
    // lut for the first block in the first grouping
    auto f_message_extract = [message_modulus](Torus block) -> Torus {
      return (block >> 1) % message_modulus;
    };

    generate_device_accumulator<Torus>(
        streams.stream(0), streams.gpu_index(0),
        lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
        lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, f_message_extract,
        gpu_memory_allocated);
    active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);

    lut_message_extract->broadcast_lut(active_streams);

    if (compute_overflow) {
      lut_borrow_flag =
          new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                   allocate_gpu_memory, size_tracker);
      // lut for the first block in the first grouping
      auto f_borrow_flag = [](Torus block) -> Torus {
        return ((block >> 2) & 1);
      };

      generate_device_accumulator<Torus>(
          streams.stream(0), streams.gpu_index(0),
          lut_borrow_flag->get_lut(0, 0), lut_borrow_flag->get_degree(0),
          lut_borrow_flag->get_max_degree(0), glwe_dimension, polynomial_size,
          message_modulus, carry_modulus, f_borrow_flag, gpu_memory_allocated);
      lut_borrow_flag->broadcast_lut(active_streams);
    }

    active_streams =
        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
    internal_streams.create_internal_cuda_streams_on_same_gpus(active_streams,
                                                               2);
  };

  // needed for the division to update the lut indexes
  void update_lut_indexes(CudaStreams streams, Torus *first_indexes_for_div,
                          Torus *second_indexes_for_div, Torus *scalars_for_div,
                          uint32_t new_num_blocks) {
    shifted_blocks_borrow_state_mem->update_lut_indexes(
        streams, first_indexes_for_div, new_num_blocks);
    prop_simu_group_carries_mem->update_lut_indexes(
        streams, second_indexes_for_div, scalars_for_div, new_num_blocks);
  }
  void release(CudaStreams streams) {

    shifted_blocks_borrow_state_mem->release(streams);
    prop_simu_group_carries_mem->release(streams);
    release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
                                   overflow_block, gpu_memory_allocated);

    lut_message_extract->release(streams);
    delete lut_message_extract;
    delete overflow_block;
    if (compute_overflow) {
      lut_borrow_flag->release(streams);
      delete lut_borrow_flag;
    }

    internal_streams.release(streams);

    cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
  };
};

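// Illustrative sketch (hypothetical helper, not used by the GPU code): the
// message-extraction lut shared by the carry and the borrow propagation
// memories above simply undoes the left shift applied by the first-step luts
// and reduces the result modulo message_modulus.
inline uint64_t clear_message_extract(uint64_t block,
                                      uint64_t message_modulus) {
  return (block >> 1) % message_modulus;
}
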
std::pair<bool, bool> get_invert_flags(COMPARISON_TYPE compare);
void reverseArray(uint64_t arr[], size_t n);

#endif // CUDA_INTEGER_UTILITIES_H