mirror of https://github.com/zama-ai/tfhe-rs.git (synced 2026-01-14 09:08:06 -05:00)

Compare commits: pa/paralle...al/sum_ctx (3 commits)

| Author | SHA1 | Date |
|---|---|---|
| | 84af5e983a | |
| | 069d47309d | |
| | 7389494c45 | |
@@ -1116,14 +1116,32 @@ template <typename Torus> struct int_overflowing_sub_memory {
};

template <typename Torus> struct int_sum_ciphertexts_vec_memory {
CudaRadixCiphertextFFI *new_blocks;
CudaRadixCiphertextFFI *new_blocks_copy;
CudaRadixCiphertextFFI *old_blocks;
CudaRadixCiphertextFFI *small_lwe_vector;
int_radix_params params;

int32_t *d_smart_copy_in;
int32_t *d_smart_copy_out;
int_radix_params params;
uint32_t active_gpu_count;
size_t chunk_size;
size_t max_pbs_count;

// temporary buffers
CudaRadixCiphertextFFI *current_blocks;
CudaRadixCiphertextFFI *small_lwe_vector;

uint32_t *d_columns_data;
uint32_t *d_columns_counter;
uint32_t **d_columns;

uint32_t *d_new_columns_data;
uint32_t *d_new_columns_counter;
uint32_t **d_new_columns;

uint64_t *d_degrees;
uint32_t *d_pbs_counters;

// additional streams
cudaStream_t *helper_streams;

// lookup table for extracting message and carry
int_radix_lut<Torus> *luts_message_carry;

bool mem_reuse = false;
bool gpu_memory_allocated;
@@ -1137,100 +1155,139 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
uint64_t *size_tracker) {
this->params = params;
gpu_memory_allocated = allocate_gpu_memory;
this->chunk_size = (params.message_modulus * params.carry_modulus - 1) /
(params.message_modulus - 1);
this->max_pbs_count =
num_blocks_in_radix * max_num_radix_in_vec * 2 / chunk_size;
this->active_gpu_count = get_active_gpu_count(2 * max_pbs_count, gpu_count);

int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
size_t max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
uint32_t message_modulus = params.message_modulus;
printf("max_total_blocks_in_vec: %d\n", max_total_blocks_in_vec);
// process streams
helper_streams =
(cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t));
for (uint j = 0; j < active_gpu_count; j++) {
helper_streams[j] = cuda_create_stream(gpu_indexes[j]);
}

// allocate gpu memory for intermediate buffers
new_blocks = new CudaRadixCiphertextFFI;
current_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
new_blocks_copy = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
old_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], old_blocks, max_pbs_count,
streams[0], gpu_indexes[0], current_blocks, max_total_blocks_in_vec,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], small_lwe_vector, max_pbs_count,
streams[0], gpu_indexes[0], small_lwe_vector, max_total_blocks_in_vec,
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);

d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
d_degrees = (uint64_t *)cuda_malloc_with_size_tracking_async(
max_total_blocks_in_vec * sizeof(uint64_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
}

int_sum_ciphertexts_vec_memory(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params, uint32_t num_blocks_in_radix,
uint32_t max_num_radix_in_vec, CudaRadixCiphertextFFI *new_blocks,
CudaRadixCiphertextFFI *old_blocks,
CudaRadixCiphertextFFI *small_lwe_vector, bool allocate_gpu_memory,
uint64_t *size_tracker) {
mem_reuse = true;
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
d_pbs_counters = (uint32_t *)cuda_malloc_with_size_tracking_async(
3 * sizeof(uint32_t), streams[0], gpu_indexes[0], size_tracker,
allocate_gpu_memory);

int max_pbs_count = num_blocks_in_radix * max_num_radix_in_vec;
auto setup_columns = [num_blocks_in_radix, max_num_radix_in_vec, streams,
gpu_indexes, size_tracker, allocate_gpu_memory](
uint32_t **&columns, uint32_t *&columns_data,
uint32_t *&columns_counter) {
columns_data = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * max_num_radix_in_vec * sizeof(uint32_t),
streams[0], gpu_indexes[0], size_tracker, allocate_gpu_memory);
columns_counter = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
columns_counter, 0, num_blocks_in_radix * sizeof(uint32_t),
streams[0], gpu_indexes[0], allocate_gpu_memory);

// assign gpu memory for intermediate buffers
this->new_blocks = new_blocks;
this->old_blocks = old_blocks;
this->small_lwe_vector = small_lwe_vector;
new_blocks_copy = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, max_pbs_count,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
uint32_t **h_columns = new uint32_t *[num_blocks_in_radix];
for (int i = 0; i < num_blocks_in_radix; ++i) {
h_columns[i] = columns_data + i * max_num_radix_in_vec;
}
columns = (uint32_t **)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t *), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
columns, h_columns, num_blocks_in_radix * sizeof(uint32_t *),
streams[0], gpu_indexes[0], allocate_gpu_memory);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete[] h_columns;
};

d_smart_copy_in = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
d_smart_copy_out = (int32_t *)cuda_malloc_with_size_tracking_async(
max_pbs_count * sizeof(int32_t), streams[0], gpu_indexes[0],
size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_in, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
d_smart_copy_out, 0, max_pbs_count * sizeof(int32_t), streams[0],
gpu_indexes[0], allocate_gpu_memory);
setup_columns(d_columns, d_columns_data, d_columns_counter);
setup_columns(d_new_columns, d_new_columns_data, d_new_columns_counter);

luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, params, 2, max_total_blocks_in_vec,
allocate_gpu_memory, size_tracker);

auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);

// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};

// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc,
luts_message_carry->get_degree(0),
luts_message_carry->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_message, allocate_gpu_memory);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc,
luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, allocate_gpu_memory);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
}

void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
cuda_drop_with_size_tracking_async(d_smart_copy_in, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_smart_copy_out, streams[0],
cuda_drop_with_size_tracking_async(d_degrees, streams[0], gpu_indexes[0],
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_pbs_counters, streams[0],
gpu_indexes[0], gpu_memory_allocated);

cuda_drop_with_size_tracking_async(d_columns_data, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns_counter, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns, streams[0], gpu_indexes[0],
gpu_memory_allocated);

cuda_drop_with_size_tracking_async(d_new_columns_data, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns_counter, streams[0],
gpu_indexes[0], gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns, streams[0],
gpu_indexes[0], gpu_memory_allocated);

for (uint i = 0; i < active_gpu_count; i++) {
cuda_destroy_stream(helper_streams[i], gpu_indexes[i]);
}

free(helper_streams);

if (!mem_reuse) {
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0], old_blocks,
release_radix_ciphertext_async(streams[0], gpu_indexes[0], current_blocks,
gpu_memory_allocated);
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
small_lwe_vector, gpu_memory_allocated);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete new_blocks;
delete old_blocks;
luts_message_carry->release(streams, gpu_indexes, gpu_count);

delete luts_message_carry;
delete current_blocks;
delete small_lwe_vector;
}
release_radix_ciphertext_async(streams[0], gpu_indexes[0], new_blocks_copy,
gpu_memory_allocated);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
delete new_blocks_copy;
}
};

// For sequential algorithm in group propagation
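Editor's note (not part of the diff): the chunk_size formula introduced above bounds how many blocks can be accumulated before the carry space overflows. A minimal standalone sketch of the arithmetic, assuming the usual 2_2 parameters (message_modulus = 4, carry_modulus = 4); names here are illustrative only:

#include <cassert>
#include <cstdint>

// Each block carries a value of degree at most message_modulus - 1, and the
// plaintext space holds values up to message_modulus * carry_modulus - 1, so
// this many blocks can be summed before a carry/message extraction is needed.
constexpr uint32_t chunk_size(uint32_t message_modulus, uint32_t carry_modulus) {
  return (message_modulus * carry_modulus - 1) / (message_modulus - 1);
}

int main() {
  // With 2_2 parameters: (4 * 4 - 1) / (4 - 1) = 5 blocks per chunk.
  assert(chunk_size(4, 4) == 5);
  return 0;
}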
@@ -2604,8 +2661,7 @@ template <typename Torus> struct int_mul_memory {
// create memory object for sum ciphertexts
sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(
streams, gpu_indexes, gpu_count, params, num_radix_blocks,
2 * num_radix_blocks, block_mul_res, vector_result_sb, small_lwe_vector,
allocate_gpu_memory, size_tracker);
2 * num_radix_blocks, allocate_gpu_memory, size_tracker);
uint32_t uses_carry = 0;
uint32_t requested_flag = outputFlag::FLAG_NONE;
sc_prop_mem = new int_sc_prop_memory<Torus>(
@@ -212,6 +212,7 @@ uint64_t scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory, bool allocate_ms_array) {

printf("pbs_type: %d\n", pbs_type);
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -234,11 +235,6 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
PANIC("Cuda error: input vector length should be a multiple of the "
"output's number of radix blocks")
// FIXME: this should not be necessary, we should make sure sum_ctxt works in
// the general case
for (int i = 0; i < radix_lwe_vec->num_radix_blocks; i++) {
radix_lwe_vec->degrees[i] = mem->params.message_modulus - 1;
}
switch (mem->params.polynomial_size) {
case 512:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
@@ -20,6 +20,7 @@
#include <fstream>
#include <iostream>
#include <omp.h>
#include <queue>
#include <sstream>
#include <string>
#include <vector>
@@ -123,6 +124,173 @@ __global__ void tree_add_chunks(Torus *result_blocks, Torus *input_blocks,
}
}

__global__ inline void radix_vec_to_columns(
uint32_t *const *const columns, uint32_t *const columns_counter,
const uint64_t *const degrees, const uint32_t num_radix_blocks,
const uint32_t total_blocks_in_vec) {

const uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x;

if (idx >= total_blocks_in_vec)
return;

const uint64_t degree = degrees[idx];
if (degree == 0)
return;

const uint32_t column_id = idx % num_radix_blocks;
const uint32_t out_idx = atomicAdd(&columns_counter[column_id], 1);
columns[column_id][out_idx] = idx;
}
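Editor's note (not part of the diff): as I read radix_vec_to_columns, block i of the flattened input vector is routed to column i % num_radix_blocks, and blocks of degree zero are skipped. A host-side sketch of the same mapping, under that assumption:

#include <cstdint>
#include <vector>

// Host-side model of radix_vec_to_columns: column j collects the indexes of
// all non-trivial blocks whose position within their radix is j.
std::vector<std::vector<uint32_t>>
to_columns(const std::vector<uint64_t> &degrees, uint32_t num_radix_blocks) {
  std::vector<std::vector<uint32_t>> columns(num_radix_blocks);
  for (uint32_t idx = 0; idx < degrees.size(); ++idx)
    if (degrees[idx] != 0)
      columns[idx % num_radix_blocks].push_back(idx);
  return columns;
}
// e.g. with num_radix_blocks = 4, indexes 2 and 6 both land in column 2,
// mirroring the atomicAdd-based append in the kernel.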
template <typename Torus>
__global__ inline void prepare_new_columns_and_pbs_indexes(
uint32_t *const *const new_columns, uint32_t *const new_columns_counter,
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, uint32_t *const pbs_counters,
const uint32_t *const *const columns, const uint32_t *const columns_counter,
const uint32_t chunk_size) {
__shared__ uint32_t counter, sharedOr;

if (threadIdx.x == 0) {
counter = 0;
sharedOr = 0;
}
__syncthreads();

const uint32_t base_id = threadIdx.x;
const uint32_t column_len = columns_counter[base_id];

uint32_t ct_count = 0;
for (uint32_t i = 0; i + chunk_size <= column_len; i += chunk_size) {
// those indexes are for message ciphertexts
// for message ciphertexts in and out index should be same
const uint32_t in_index = columns[base_id][i];
new_columns[base_id][ct_count] = in_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = in_index;
lut_indexes[pbs_index] = 0;
++ct_count;
}

__syncthreads();
uint32_t message_count = counter;

if (base_id > 0) {
const uint32_t prev_base_id = base_id - 1;
const uint32_t prev_column_len = columns_counter[prev_base_id];

for (uint32_t i = 0; i + chunk_size <= prev_column_len; i += chunk_size) {
// those indexes are for carry ciphertexts
// for carry ciphertexts input is same as for message
// output will be placed to next block in the column
const uint32_t in_index = columns[prev_base_id][i];
const uint32_t out_index = columns[prev_base_id][i + 1];
new_columns[base_id][ct_count] = out_index;
const uint32_t pbs_index = atomicAdd(&counter, 1);
pbs_indexes_in[pbs_index] = in_index;
pbs_indexes_out[pbs_index] = out_index;
lut_indexes[pbs_index] = 1;
++ct_count;
}
}

const uint32_t start_index = column_len - column_len % chunk_size;
for (uint32_t i = start_index; i < column_len; ++i) {
new_columns[base_id][ct_count] = columns[base_id][i];
++ct_count;
}

new_columns_counter[base_id] = ct_count;

if (ct_count > chunk_size) {
atomicOr(&sharedOr, 1);
}
__syncthreads();

if (threadIdx.x == 0) {
pbs_counters[0] = counter;
pbs_counters[1] = message_count;
pbs_counters[2] = sharedOr;
}
}
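Editor's note (not part of the diff): the three pbs_counters written at the end of this kernel drive the host loop later in the patch: total PBS count, message-only count, and a flag saying whether another pass is needed. A hedged host-side model of how one column shrinks per pass, assuming the chunking described by the kernel (names are illustrative):

#include <cstdint>

// Each full chunk collapses to one message block kept in this column and one
// carry block emitted into column j + 1; leftovers are kept unchanged.
struct ColumnUpdate {
  uint32_t kept_here;        // full_chunks + leftovers of this column
  uint32_t carries_emitted;  // entries pushed into column j + 1
};

ColumnUpdate compress_column(uint32_t column_len, uint32_t chunk_size) {
  uint32_t full_chunks = column_len / chunk_size;
  uint32_t leftovers = column_len % chunk_size;
  return {full_chunks + leftovers, full_chunks};
}
// Example: column_len = 7, chunk_size = 5 -> {3, 1}: one compressed block,
// two leftovers, one carry for the next column. pbs_counters[2] stays set as
// long as some new column still holds more than chunk_size entries.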
template <typename Torus>
__global__ inline void prepare_final_pbs_indexes(
Torus *const pbs_indexes_in, Torus *const pbs_indexes_out,
Torus *const lut_indexes, const uint32_t num_radix_blocks) {
int idx = threadIdx.x;
pbs_indexes_in[idx] = idx % num_radix_blocks;
pbs_indexes_out[idx] = idx + idx / num_radix_blocks;
lut_indexes[idx] = idx / num_radix_blocks;
}
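Editor's note (not part of the diff): a quick check of the index arithmetic above, assuming num_radix_blocks = n and 2 * n threads: as I read it, messages are bootstrapped back into blocks 0..n-1 while carries land one block higher (n+1..2n), leaving block n to be zeroed before the final addition.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t n = 2; // num_radix_blocks
  uint64_t in[4], out[4], lut[4];
  for (uint32_t idx = 0; idx < 2 * n; ++idx) {
    in[idx] = idx % n;        // each output block is keyswitched twice
    out[idx] = idx + idx / n; // messages -> 0..n-1, carries -> n+1..2n
    lut[idx] = idx / n;       // 0 = message extract, 1 = carry extract
  }
  assert(out[2] == 3 && out[3] == 4 && lut[2] == 1);
  return 0;
}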
template <typename Torus>
__global__ void calculate_chunks(Torus *const input_blocks,
const uint32_t *const *const columns,
const uint32_t *const columns_counter,
const uint32_t chunk_size,
const uint32_t block_size) {

const uint32_t part_size = blockDim.x;
const uint32_t base_id = blockIdx.x;
const uint32_t part_id = blockIdx.y;
const uint32_t coef_id = part_id * part_size + threadIdx.x;

if (coef_id >= block_size)
return;

const uint32_t column_len = columns_counter[base_id];

if (column_len >= chunk_size) {
const uint32_t num_chunks = column_len / chunk_size;
Torus result = 0;

for (uint32_t chunk_id = 0; chunk_id < num_chunks; ++chunk_id) {
const uint32_t first_ct_id = columns[base_id][chunk_id * chunk_size];
result = input_blocks[first_ct_id * block_size + coef_id];

for (uint32_t ct_id = 1; ct_id < chunk_size; ++ct_id) {
const uint32_t cur_ct_id =
columns[base_id][chunk_id * chunk_size + ct_id];
result += input_blocks[cur_ct_id * block_size + coef_id];
}

input_blocks[first_ct_id * block_size + coef_id] = result;
}
}
}
template <typename Torus>
__global__ void calculate_final_chunk_into_radix(
Torus *const out_radix, const Torus *const input_blocks,
const uint32_t *const *const columns, const uint32_t *const columns_counter,
const uint32_t chunk_size, const uint32_t block_size) {

const uint32_t part_size = blockDim.x;
const uint32_t base_id = blockIdx.x;
const uint32_t part_id = blockIdx.y;
const uint32_t coef_id = part_id * part_size + threadIdx.x;

if (coef_id >= block_size)
return;

const uint32_t column_len = columns_counter[base_id];

Torus result = 0;
if (column_len) {
const uint32_t first_ct_id = columns[base_id][0];
result = input_blocks[first_ct_id * block_size + coef_id];

for (uint32_t i = 1; i < column_len; ++i) {
const uint32_t cur_ct_it = columns[base_id][i];
result += input_blocks[cur_ct_it * block_size + coef_id];
}
}
out_radix[base_id * block_size + coef_id] = result;
}
template <typename Torus, class params>
__global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
Torus *msb_blocks,
@@ -167,6 +335,65 @@ __global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
(process_msb) ? cur_msb_ct[params::degree] : 0;
}
}

inline bool at_least_one_column_needs_processing(
const uint64_t *const degrees, const uint32_t num_radix_blocks,
const uint32_t num_radix_in_vec, const uint32_t chunk_size) {
std::vector<uint32_t> columns_count(num_radix_blocks, 0);

for (size_t column = 0; column < num_radix_blocks; ++column) {
for (size_t block = 0; block < num_radix_in_vec; ++block) {
const size_t block_index = block * num_radix_blocks + column;
if (degrees[block_index]) {
columns_count[column]++;
if (columns_count[column] > chunk_size) {
return true;
}
}
}
}
return false;
}
inline void calculate_final_degrees(uint64_t *const out_degrees,
const uint64_t *const input_degrees,
size_t num_blocks, size_t num_radix_in_vec,
size_t chunk_size,
uint64_t message_modulus) {

auto get_degree = [message_modulus](uint64_t degree) -> uint64_t {
return std::min(message_modulus - 1, degree);
};
std::vector<std::queue<uint64_t>> columns(num_blocks);
for (size_t i = 0; i < num_radix_in_vec; ++i) {
for (size_t j = 0; j < num_blocks; ++j) {
if (input_degrees[i * num_blocks + j])
columns[j].push(input_degrees[i * num_blocks + j]);
}
}

for (size_t i = 0; i < num_blocks; ++i) {
auto &col = columns[i];
while (col.size() > 1) {
uint32_t cur_degree = 0;
size_t mn = std::min(chunk_size, col.size());
for (int j = 0; j < mn; ++j) {
cur_degree += col.front();
col.pop();
}
const uint64_t new_degree = get_degree(cur_degree);
col.push(new_degree);
if ((i + 1) < num_blocks) {
columns[i + 1].push(new_degree);
}
}
}

for (int i = 0; i < num_blocks; i++) {
out_degrees[i] = (columns[i].empty()) ? 0 : columns[i].front();
}
}
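Editor's note (not part of the diff): calculate_final_degrees mirrors the device-side column compression on the host so output degrees can be set without reading ciphertexts back. A short trace, assuming message_modulus = 4 (so degrees are clamped at 3) and chunk_size = 5: a column holding seven degree-3 entries first compresses five of them into min(3, 5 * 3) = 3 and pushes a degree-3 carry into the next column, leaving three entries; the second pass compresses those into min(3, 3 * 3) = 3, so the final reported degree for that block is 3.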
template <typename Torus>
__host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -181,6 +408,70 @@ __host__ uint64_t scratch_cuda_integer_partial_sum_ciphertexts_vec_kb(
return size_tracker;
}
void static DEBUG_PRINT_COLUMNS(uint32_t *d_column_data,
uint32_t *d_columns_count,
uint64_t *d_pbs_indexes_in,
uint64_t *d_pbs_indexes_out,
uint64_t *d_lut_indexes, int L, int N,
int pbs_cnt) {
cudaDeviceSynchronize(); // Ensure all device work is done

std::vector<uint64_t> h_pbs_indexes_in(pbs_cnt);
std::vector<uint64_t> h_pbs_indexes_out(pbs_cnt);
std::vector<uint64_t> h_lut_indexes(pbs_cnt);

check_cuda_error(cudaMemcpy(h_pbs_indexes_in.data(), d_pbs_indexes_in,
pbs_cnt * sizeof(uint64_t),
cudaMemcpyDeviceToHost));
check_cuda_error(cudaMemcpy(h_pbs_indexes_out.data(), d_pbs_indexes_out,
pbs_cnt * sizeof(uint64_t),
cudaMemcpyDeviceToHost));
check_cuda_error(cudaMemcpy(h_lut_indexes.data(), d_lut_indexes,
pbs_cnt * sizeof(uint64_t),
cudaMemcpyDeviceToHost));

std::vector<uint32_t> h_columns_count(L);
check_cuda_error(cudaMemcpy(h_columns_count.data(), d_columns_count,
L * sizeof(uint32_t), cudaMemcpyDeviceToHost));

std::vector<uint32_t> h_column_data(L * N);
check_cuda_error(cudaMemcpy(h_column_data.data(), d_column_data,
L * N * sizeof(uint32_t),
cudaMemcpyDeviceToHost));
cudaDeviceSynchronize(); // Ensure all device work is done

std::cout << "column_counters: ";
for (auto a : h_columns_count) {
std::cout << a << " ";
}

std::cout << std::endl;

for (int col = 0; col < L; ++col) {
std::cout << "Column[" << col << "]: ";
uint32_t count = h_columns_count[col];
for (uint32_t i = 0; i < count; ++i) {
std::cout << h_column_data[col * N + i] << " ";
}
std::cout << "\n";
}

printf("pbs_indexes %d\n", pbs_cnt);
for (auto a : h_pbs_indexes_in) {
printf("%d ", a);
}
printf("\n");
for (auto a : h_pbs_indexes_out) {
printf("%d ", a);
}
printf("\n");
for (auto a : h_lut_indexes) {
printf("%d ", a);
}
printf("\n");
printf("=========================================================\n");
}
template <typename Torus, class params>
__host__ void host_integer_partial_sum_ciphertexts_vec_kb(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -199,22 +490,30 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
PANIC("Cuda error: input vector does not have enough blocks")
if (num_radix_blocks > radix_lwe_out->num_radix_blocks)
PANIC("Cuda error: output does not have enough blocks")
auto new_blocks = mem_ptr->new_blocks;
auto new_blocks_copy = mem_ptr->new_blocks_copy;
auto old_blocks = mem_ptr->old_blocks;

auto current_blocks = mem_ptr->current_blocks;
auto small_lwe_vector = mem_ptr->small_lwe_vector;
auto d_degrees = mem_ptr->d_degrees;
auto d_columns = mem_ptr->d_columns;
auto d_columns_counter = mem_ptr->d_columns_counter;
auto d_new_columns = mem_ptr->d_new_columns;
auto d_new_columns_counter = mem_ptr->d_new_columns_counter;
auto d_pbs_indexes_in = mem_ptr->luts_message_carry->lwe_indexes_in;
auto d_pbs_indexes_out = mem_ptr->luts_message_carry->lwe_indexes_out;
auto d_pbs_counters = mem_ptr->d_pbs_counters;

auto d_smart_copy_in = mem_ptr->d_smart_copy_in;
auto d_smart_copy_out = mem_ptr->d_smart_copy_out;
auto luts_message_carry = mem_ptr->luts_message_carry;

auto message_modulus = mem_ptr->params.message_modulus;
auto carry_modulus = mem_ptr->params.carry_modulus;
auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
auto glwe_dimension = mem_ptr->params.glwe_dimension;
auto polynomial_size = mem_ptr->params.polynomial_size;
auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension;
auto small_lwe_size = small_lwe_dimension + 1;
auto helper_streams = mem_ptr->helper_streams;
auto chunk_size = mem_ptr->chunk_size;

size_t total_blocks_in_vec = num_radix_blocks * num_radix_in_vec;

// In the case of extracting a single LWE, these parameters are dummy
uint32_t num_many_lut = 1;
@@ -228,244 +527,153 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
terms, 0, num_radix_blocks);
return;
}
if (old_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0], old_blocks,
terms);
}

if (num_radix_in_vec == 2) {
CudaRadixCiphertextFFI old_blocks_slice;
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks_slice, num_radix_blocks);
CudaRadixCiphertextFFI terms_slice;
as_radix_ciphertext_slice<Torus>(&terms_slice, terms, num_radix_blocks,
2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, terms,
&terms_slice, num_radix_blocks);
return;
}

size_t r = num_radix_in_vec;
size_t total_modulus = message_modulus * carry_modulus;
size_t message_max = message_modulus - 1;
size_t chunk_size = (total_modulus - 1) / message_max;

size_t h_lwe_idx_in[terms->num_radix_blocks];
size_t h_lwe_idx_out[terms->num_radix_blocks];
int32_t h_smart_copy_in[terms->num_radix_blocks];
int32_t h_smart_copy_out[terms->num_radix_blocks];

/// Here it is important to query the default max shared memory on device 0
/// instead of cuda_get_max_shared_memory,
/// to avoid bugs with tree_add_chunks trying to use too much shared memory
auto max_shared_memory = 0;
check_cuda_error(cudaDeviceGetAttribute(
&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, 0));

// create lut object for message and carry
// we allocate luts_message_carry in the host function (instead of scratch)
// to reduce average memory consumption
int_radix_lut<Torus> *luts_message_carry;
size_t ch_amount = r / chunk_size;
if (!ch_amount)
ch_amount++;
if (reused_lut == nullptr) {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
2 * ch_amount * num_radix_blocks, true, nullptr);
} else {
luts_message_carry = new int_radix_lut<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->params, 2,
2 * ch_amount * num_radix_blocks, reused_lut, true, nullptr);
if (current_blocks != terms) {
copy_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
current_blocks, terms);
}
auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);

// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
cuda_memcpy_async_to_gpu(d_degrees, current_blocks->degrees,
total_blocks_in_vec * sizeof(uint64_t), streams[0],
gpu_indexes[0]);

// generate accumulators
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], message_acc,
luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
glwe_dimension, polynomial_size, message_modulus, carry_modulus,
lut_f_message, true);
generate_device_accumulator<Torus>(
streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, lut_f_carry, true);
luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
int number_of_threads = 512;
int number_of_blocks =
(total_blocks_in_vec + number_of_threads - 1) / number_of_threads;
while (r > 2) {
size_t cur_total_blocks = r * num_radix_blocks;
size_t ch_amount = r / chunk_size;
if (!ch_amount)
ch_amount++;
dim3 add_grid(ch_amount, num_radix_blocks, 1);
radix_vec_to_columns<<<number_of_blocks, number_of_threads, 0, streams[0]>>>(
d_columns, d_columns_counter, d_degrees, num_radix_blocks,
total_blocks_in_vec);

cuda_set_device(gpu_indexes[0]);
tree_add_chunks<Torus><<<add_grid, 512, 0, streams[0]>>>(
(Torus *)new_blocks->ptr, (Torus *)old_blocks->ptr,
std::min(r, chunk_size), big_lwe_size, num_radix_blocks);
DEBUG_PRINT_COLUMNS(mem_ptr->d_columns_data, d_columns_counter,
d_pbs_indexes_in, d_pbs_indexes_out,
luts_message_carry->get_lut_indexes(0, 0),
num_radix_blocks, num_radix_in_vec, 0);
bool needs_processing = at_least_one_column_needs_processing(
current_blocks->degrees, num_radix_blocks, num_radix_in_vec, chunk_size);

check_cuda_error(cudaGetLastError());
number_of_threads = min(256, params::degree);
int part_count = (big_lwe_size + number_of_threads - 1) / number_of_threads;
const dim3 number_of_blocks_2d(num_radix_blocks, part_count, 1);

size_t total_count = 0;
size_t message_count = 0;
size_t carry_count = 0;
size_t sm_copy_count = 0;
// h_pbs_counters[0] - total ciphertexts
// h_pbs_counters[1] - message ciphertexts
// h_pbs_counters[2] - at_least_one_column_needs_processing
uint32_t *h_pbs_counters = (uint32_t *)malloc(3 * sizeof(uint32_t));

generate_ids_update_degrees(
terms->degrees, h_lwe_idx_in, h_lwe_idx_out, h_smart_copy_in,
h_smart_copy_out, ch_amount, r, num_radix_blocks, chunk_size,
message_max, total_count, message_count, carry_count, sm_copy_count);
auto lwe_indexes_in = luts_message_carry->lwe_indexes_in;
auto lwe_indexes_out = luts_message_carry->lwe_indexes_out;
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
h_lwe_idx_in, h_lwe_idx_out);
cuda_synchronize_stream(streams[0], gpu_indexes[0]);
int DEBUG_I = 0;
while (needs_processing) {
calculate_chunks<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(current_blocks->ptr), d_columns, d_columns_counter,
chunk_size, big_lwe_size);

size_t copy_size = sm_copy_count * sizeof(int32_t);
cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
streams[0], gpu_indexes[0]);
prepare_new_columns_and_pbs_indexes<<<1, num_radix_blocks, 0,
helper_streams[0]>>>(
d_new_columns, d_new_columns_counter, d_pbs_indexes_in,
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0),
d_pbs_counters, d_columns, d_columns_counter, chunk_size);

// inside d_smart_copy_in there are only -1 values
// it's fine to call smart_copy with same pointer
// as source and destination
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], new_blocks_copy, 0, r * num_radix_blocks,
new_blocks, 0, r * num_radix_blocks);
smart_copy<Torus><<<sm_copy_count, 1024, 0, streams[0]>>>(
(Torus *)new_blocks->ptr, (Torus *)new_blocks_copy->ptr,
d_smart_copy_out, d_smart_copy_in, big_lwe_size);
check_cuda_error(cudaGetLastError());
cuda_memcpy_async_to_cpu(h_pbs_counters, d_pbs_counters,
3 * sizeof(uint32_t), helper_streams[0],
gpu_indexes[0]);

if (carry_count > 0)
cuda_set_value_async<Torus>(
streams[0], gpu_indexes[0],
luts_message_carry->get_lut_indexes(0, message_count), 1,
carry_count);
cuda_synchronize_stream(helper_streams[0], gpu_indexes[0]);

luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
const uint32_t total_ciphertexts = h_pbs_counters[0];
const uint32_t total_messages = h_pbs_counters[1];
needs_processing = (h_pbs_counters[2] != 0);

/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<Torus *> new_blocks_vec = luts_message_carry->lwe_array_in_vec;
std::vector<Torus *> small_lwe_vector_vec =
luts_message_carry->lwe_after_ks_vec;
std::vector<Torus *> lwe_after_pbs_vec =
luts_message_carry->lwe_after_pbs_vec;
std::vector<Torus *> lwe_trivial_indexes_vec =
luts_message_carry->lwe_trivial_indexes_vec;

auto active_gpu_count = get_active_gpu_count(total_count, gpu_count);
if (active_gpu_count == 1) {
/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
lwe_indexes_in, (Torus *)new_blocks->ptr, lwe_indexes_in, ksks,
polynomial_size * glwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, message_count);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)new_blocks->ptr, lwe_indexes_out,
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
(Torus *)small_lwe_vector->ptr, lwe_indexes_in, bsks,
ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);
if (DEBUG_I % 2 == 0) {
DEBUG_PRINT_COLUMNS(
mem_ptr->d_new_columns_data, d_new_columns_counter, d_pbs_indexes_in,
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0),
num_radix_blocks, num_radix_in_vec, total_ciphertexts);
} else {
cuda_synchronize_stream(streams[0], gpu_indexes[0]);

multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, new_blocks_vec,
(Torus *)new_blocks->ptr, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
big_lwe_size);

/// Apply KS to go from a big LWE dimension to a small LWE dimension
/// After this keyswitch execution, we need to synchronize the streams
/// because the keyswitch and PBS do not operate on the same number of
/// inputs
execute_keyswitch_async<Torus>(
streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec,
lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec,
ksks, big_lwe_dimension, small_lwe_dimension,
mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count);

/// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a
/// different configuration
multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)small_lwe_vector->ptr,
small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, message_count,
small_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}

multi_gpu_scatter_lwe_async<Torus>(
streams, gpu_indexes, gpu_count, small_lwe_vector_vec,
(Torus *)small_lwe_vector->ptr, luts_message_carry->h_lwe_indexes_in,
luts_message_carry->using_trivial_lwe_indexes, total_count,
small_lwe_size);

/// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
/// dimension to a big LWE dimension
execute_pbs_async<Torus>(
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
lwe_trivial_indexes_vec, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, small_lwe_vector_vec,
lwe_trivial_indexes_vec, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_count, mem_ptr->params.pbs_type, num_many_lut, lut_stride);

multi_gpu_gather_lwe_async<Torus>(
streams, gpu_indexes, active_gpu_count, (Torus *)new_blocks->ptr,
lwe_after_pbs_vec, luts_message_carry->h_lwe_indexes_out,
luts_message_carry->using_trivial_lwe_indexes, total_count,
big_lwe_size);
/// Synchronize all GPUs
for (uint i = 0; i < active_gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
for (uint i = 0; i < total_count; i++) {
auto degrees_index = luts_message_carry->h_lut_indexes[i];
new_blocks->degrees[i] = luts_message_carry->degrees[degrees_index];
new_blocks->noise_levels[i] = NoiseLevel::NOMINAL;
DEBUG_PRINT_COLUMNS(
mem_ptr->d_columns_data, d_new_columns_counter, d_pbs_indexes_in,
d_pbs_indexes_out, luts_message_carry->get_lut_indexes(0, 0),
num_radix_blocks, num_radix_in_vec, total_ciphertexts);
}

int rem_blocks = (r > chunk_size) ? r % chunk_size * num_radix_blocks : 0;
int new_blocks_created = 2 * ch_amount * num_radix_blocks;
cudaDeviceSynchronize();

if (rem_blocks > 0)
copy_radix_ciphertext_slice_async<Torus>(
streams[0], gpu_indexes[0], new_blocks, new_blocks_created,
new_blocks_created + rem_blocks, old_blocks,
cur_total_blocks - rem_blocks, cur_total_blocks);
std::swap(new_blocks, old_blocks);
r = (new_blocks_created + rem_blocks) / num_radix_blocks;
printf("total_messages: %d\n", total_messages);
printf("total_ct: %d\n", total_ciphertexts);
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, (Torus *)current_blocks->ptr, d_pbs_indexes_in, ksks,
big_lwe_dimension, small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, total_messages);

execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr,
d_pbs_indexes_out, luts_message_carry->lut_vec,
luts_message_carry->lut_indexes_vec, (Torus *)small_lwe_vector->ptr,
d_pbs_indexes_in, bsks, ms_noise_reduction_key,
luts_message_carry->buffer, glwe_dimension, small_lwe_dimension,
polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
total_ciphertexts, mem_ptr->params.pbs_type, num_many_lut, lut_stride);

cuda_synchronize_stream(streams[0], gpu_indexes[0]);
std::swap(d_columns, d_new_columns);
std::swap(d_columns_counter, d_new_columns_counter);
++DEBUG_I;
}
luts_message_carry->release(streams, gpu_indexes, gpu_count);
delete (luts_message_carry);

CudaRadixCiphertextFFI old_blocks_slice;
as_radix_ciphertext_slice<Torus>(&old_blocks_slice, old_blocks,
calculate_final_chunk_into_radix<Torus>
<<<number_of_blocks_2d, number_of_threads, 0, streams[0]>>>(
(Torus *)(radix_lwe_out->ptr), (Torus *)(current_blocks->ptr),
d_columns, d_columns_counter, chunk_size, big_lwe_size);

prepare_final_pbs_indexes<Torus>
<<<1, 2 * num_radix_blocks, 0, helper_streams[0]>>>(
d_pbs_indexes_in, d_pbs_indexes_out,
luts_message_carry->get_lut_indexes(0, 0), num_radix_blocks);

cuda_memset_async(
(Torus *)(current_blocks->ptr) + big_lwe_size * num_radix_blocks, 0,
big_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]);

cuda_synchronize_stream(helper_streams[0], gpu_indexes[0]);

print_debug<Torus>("indexes_in", d_pbs_indexes_in, 2 * num_radix_blocks);
print_debug<Torus>("indexes_out", d_pbs_indexes_out, 2 * num_radix_blocks);
print_debug<Torus>("lut_indexes", luts_message_carry->get_lut_indexes(0, 0),
2 * num_radix_blocks);

execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, (Torus *)small_lwe_vector->ptr, d_pbs_indexes_in,
(Torus *)radix_lwe_out->ptr, d_pbs_indexes_in, ksks, big_lwe_dimension,
small_lwe_dimension, mem_ptr->params.ks_base_log,
mem_ptr->params.ks_level, num_radix_blocks);

execute_pbs_async<Torus>(
streams, gpu_indexes, 1, (Torus *)current_blocks->ptr, d_pbs_indexes_out,
luts_message_carry->lut_vec, luts_message_carry->lut_indexes_vec,
(Torus *)small_lwe_vector->ptr, d_pbs_indexes_in, bsks,
ms_noise_reduction_key, luts_message_carry->buffer, glwe_dimension,
small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log,
mem_ptr->params.pbs_level, mem_ptr->params.grouping_factor,
2 * num_radix_blocks, mem_ptr->params.pbs_type, num_many_lut, lut_stride);

CudaRadixCiphertextFFI current_blocks_slice;
as_radix_ciphertext_slice<Torus>(&current_blocks_slice, current_blocks,
num_radix_blocks, 2 * num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out, old_blocks,
&old_blocks_slice, num_radix_blocks);

host_addition<Torus>(streams[0], gpu_indexes[0], radix_lwe_out,
current_blocks, &current_blocks_slice, num_radix_blocks);
}
template <typename Torus, class params>

@@ -118,7 +118,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)

add_to_torus<Torus, params>(accumulator_fft, accumulator_rotated, true);
}

__syncthreads();
auto accumulator = accumulator_rotated;

if (blockIdx.z == 0) {
@@ -357,16 +357,19 @@ uint64_t scratch_cuda_programmable_bootstrap_64(
#endif
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
input_lwe_ciphertext_count, max_shared_memory)) {
printf("it is cg\n");
return scratch_cuda_programmable_bootstrap_cg<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
else
} else {
printf("it is default\n");
return scratch_cuda_programmable_bootstrap<uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
}
}

template <typename Torus>
@@ -3235,6 +3235,8 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
.iter()
.map(|b| b.noise_level.0)
.collect();

println!("radix_list_degrees: {:?}", radix_list_degrees);
let mut cuda_ffi_radix_list = prepare_cuda_radix_ffi(
radix_list,
&mut radix_list_degrees,
@@ -317,6 +317,8 @@ impl CudaServerKey {

let mut terms = CudaRadixCiphertext::from_radix_ciphertext_vec(ciphertexts, streams);

println!("terms.blocks.lem: {:?}", terms.info.blocks.len());
println!("terms: {:?}", terms.info.blocks.get(0).unwrap().degree);
match &self.bootstrapping_key {
CudaBootstrappingKey::Classic(d_bsk) => {
unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async(
@@ -394,7 +396,8 @@ impl CudaServerKey {
.unchecked_partial_sum_ciphertexts_async(ciphertexts, streams)
.unwrap();

self.propagate_single_carry_assign_async(&mut result, streams, None, OutputFlag::None);
//self.propagate_single_carry_assign_async(&mut result, streams, None, OutputFlag::None);
//self.full_propagate_assign_async(&mut result, streams);
assert!(result.block_carries_are_empty());
result
}
@@ -134,6 +134,7 @@ impl ServerKey {
};

if self.is_eligible_for_parallel_single_carry_propagation(blocks.len()) {
println!("is_eligible_for_parallel_single_carry_propagation");
let highest_degree = blocks
.iter()
.max_by(|block_a, block_b| block_a.degree.get().cmp(&block_b.degree.get()))
@@ -233,8 +234,11 @@ impl ServerKey {
.position(|block| !block.carry_is_empty())
.unwrap_or(num_blocks);

println!("start_index: {:?}", start_index);
let (to_be_cleaned, to_be_propagated) = ctxt.blocks_mut().split_at_mut(start_index);

println!("to_be_cleaned.len: {:?}", to_be_cleaned.len());
println!("to_be_propagated.len: {:?}", to_be_propagated.len());
rayon::scope(|s| {
if !to_be_propagated.is_empty() {
s.spawn(|_| {
@@ -73,39 +73,53 @@ impl ServerKey {
};

while at_least_one_column_has_enough_elements(&columns) {
columns
.par_drain(..)
.zip(column_output_buffer.par_iter_mut())
for (column_index, (mut column, out_buf)) in columns
.drain(..)
.zip(column_output_buffer.iter_mut())
.enumerate()
.map(|(column_index, (mut column, out_buf))| {
if column.len() < num_elements_to_fill_carry {
return column;
{
if column.len() < num_elements_to_fill_carry {
columns_buffer.push(column);
continue;
}

let mut output_pairs = Vec::new();

for chunk in column.chunks_exact(num_elements_to_fill_carry) {
let mut result = chunk[0].clone();
for c in &chunk[1..] {
self.key.unchecked_add_assign(&mut result, c);
}
column
.par_chunks_exact(num_elements_to_fill_carry)
.map(|chunk| {
let mut result = chunk[0].clone();
for c in &chunk[1..] {
self.key.unchecked_add_assign(&mut result, c);
}

if (column_index < num_columns - 1) || output_carries.is_some() {
rayon::join(
|| self.key.message_extract(&result),
|| Some(self.key.carry_extract(&result)),
)
} else {
(self.key.message_extract(&result), None)
}
})
.collect_into_vec(out_buf);
println!(
"chunk_result: {:?} {:?}",
column_index,
result.ct.get_body()
);

let num_elem_in_rest = column.len() % num_elements_to_fill_carry;
column.rotate_right(num_elem_in_rest);
column.truncate(num_elem_in_rest);
column
})
.collect_into_vec(&mut columns_buffer);
if (column_index < num_columns - 1) || output_carries.is_some() {
println!("Before message_extract: {:?}", result.ct.get_body().data);
let msg = self.key.message_extract(&result);
println!("After message_extract: {:?}", msg.ct.get_body().data);

println!("Before carry_extract: {:?}", result.ct.get_body().data);
let carry = self.key.carry_extract(&result);
println!("After carry_extract: {:?}", carry.ct.get_body().data);

output_pairs.push((msg, Some(carry)));
} else {
let msg = self.key.message_extract(&result);
output_pairs.push((msg, None));
}
}

*out_buf = output_pairs;

let num_elem_in_rest = column.len() % num_elements_to_fill_carry;
column.rotate_right(num_elem_in_rest);
column.truncate(num_elem_in_rest);
columns_buffer.push(column);
}

std::mem::swap(&mut columns, &mut columns_buffer);

@@ -123,6 +137,16 @@ impl ServerKey {
}
}
}

let mut index = 0;
for column in &columns {
print!("column_{:?} ", index);
for ct in column {
print!("{:?} ", ct.ct.get_body().data);
}
println!("");
index += 1;
}
}

// Reconstruct a radix from the columns
@@ -136,6 +160,7 @@ impl ServerKey {
column.as_mut_slice().split_first_mut().unwrap();
for other in other_blocks {
self.key.unchecked_add_assign(first_block, other);
println!("after_add: {:?}", first_block.ct.get_body());
}
column.swap_remove(0)
}
@@ -143,6 +168,9 @@ impl ServerKey {
.collect::<Vec<_>>();
assert_eq!(blocks.len(), num_blocks);

for block in &blocks {
println!("final_result: {:?} {:?}", block.ct.get_body(), block.degree);
}
Some(T::from_blocks(blocks))
}

@@ -160,7 +188,7 @@ impl ServerKey {
self.unchecked_partial_sum_ciphertexts_vec_parallelized(ciphertexts, None)?;

self.full_propagate_parallelized(&mut result);
assert!(result.block_carries_are_empty());
//assert!(result.block_carries_are_empty());

Some(result)
}
@@ -143,7 +143,8 @@ where
T: for<'a> FunctionExecutor<&'a Vec<RadixCiphertext>, Option<RadixCiphertext>>,
{
let param = param.into();
let nb_tests_smaller = nb_tests_smaller_for_params(param);
println!("params: {:?}", &param);
let nb_tests_smaller: usize = 1;
let (cks, sks) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix);
let cks = RadixClientKey::from((
cks,
@@ -160,9 +161,14 @@ where
.0
.pow(crate::integer::server_key::radix_parallel::tests_unsigned::NB_CTXT as u32);

println!("modulus, {:?}", modulus);
println!(
"NB_CTXT, {:?}",
crate::integer::server_key::radix_parallel::tests_unsigned::NB_CTXT
);
executor.setup(&cks, sks);

for len in [1, 2, 15, 16, 17, 64, 65] {
for len in [64] {
for _ in 0..nb_tests_smaller {
let clears = (0..len)
.map(|_| rng.gen::<u64>() % modulus)
@@ -889,15 +889,18 @@ impl<AP: AtomicPattern> GenericServerKey<AP> {
}

pub fn apply_lookup_table_assign(&self, ct: &mut Ciphertext, acc: &LookupTableOwned) {
println!("degree_before: {:?}", ct.degree);
if ct.is_trivial() {
self.trivial_pbs_assign(ct, acc);
println!("degree_after: {:?}", ct.degree);
return;
}

self.atomic_pattern.apply_lookup_table_assign(ct, acc);

ct.degree = acc.degree;
ct.set_noise_level_to_nominal();
ct.set_noise_level(NoiseLevel::NOMINAL, self.max_noise_level);
println!("degree_after: {:?}", ct.degree);
}

/// Compute a keyswitch and programmable bootstrap applying several functions on an input