mirror of https://github.com/zama-ai/tfhe-rs.git (synced 2026-01-09 14:47:56 -05:00)

feat(gpu): add squash noise in the hlapi
@@ -24,7 +24,15 @@ using LweArrayVariant = std::variant<std::vector<Torus *>, Torus *>;
      return std::get<Torus *>(variant); \
    } \
  }()

// Macro to define the visitor logic using std::holds_alternative for vectors
#define GET_VARIANT_ELEMENT_64BIT(variant, index) \
  [&] { \
    if (std::holds_alternative<std::vector<uint64_t *>>(variant)) { \
      return std::get<std::vector<uint64_t *>>(variant)[index]; \
    } else { \
      return std::get<uint64_t *>(variant); \
    } \
  }()
int get_active_gpu_count(int num_inputs, int gpu_count);

int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
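An aside on the macro above, for readers of this diff: it resolves an LweArrayVariant to a concrete device pointer whether the variant holds one pointer per GPU or a single shared pointer. A function-form sketch of the same dispatch (illustrative, not part of the commit):

#include <cstdint>
#include <variant>
#include <vector>

using LweArrayVariant64 = std::variant<std::vector<uint64_t *>, uint64_t *>;

// Same dispatch the macro performs, written as a function for clarity.
inline uint64_t *resolve(const LweArrayVariant64 &variant, size_t index) {
  if (std::holds_alternative<std::vector<uint64_t *>>(variant))
    return std::get<std::vector<uint64_t *>>(variant)[index]; // one pointer per GPU
  return std::get<uint64_t *>(variant); // single array shared by all GPUs
}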
@@ -567,5 +567,27 @@ void cuda_integer_radix_scalar_mul_high_kb_64(
void cleanup_cuda_integer_radix_scalar_mul_high_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void);
uint64_t scratch_cuda_apply_noise_squashing_kb(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t input_glwe_dimension,
    uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_radix_blocks, uint32_t num_original_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, bool allocate_ms_array);

void cuda_apply_noise_squashing_kb(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
    void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void *const *bsks);

void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
                                           uint32_t const *gpu_indexes,
                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void);
} // extern C
#endif // CUDA_INTEGER_H
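These three entry points follow the backend's usual scratch/apply/cleanup pattern. A sketch of the call sequence a binding layer might follow; every variable here is a placeholder, not something defined by the commit:

// Sketch only: assumes streams/gpu_indexes/keys were set up elsewhere.
int8_t *mem = nullptr;
uint64_t footprint_bytes = scratch_cuda_apply_noise_squashing_kb(
    streams, gpu_indexes, gpu_count, &mem,
    /*lwe_dimension=*/lwe_dim, /*glwe_dimension=*/glwe_dim,
    /*polynomial_size=*/N, input_glwe_dim, input_N,
    ks_level, ks_base_log, pbs_level, pbs_base_log, grouping_factor,
    num_radix_blocks, num_original_blocks, message_modulus, carry_modulus,
    pbs_type, /*allocate_gpu_memory=*/true, /*allocate_ms_array=*/true);

cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count, &out_ct,
                              &in_ct, mem, ksks, ms_noise_reduction_key,
                              bsks);

cleanup_cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count, &mem);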
@@ -6,6 +6,8 @@
#include "integer/radix_ciphertext.h"
#include "keyswitch/keyswitch.h"
#include "pbs/programmable_bootstrap.cuh"
#include "pbs/programmable_bootstrap_128.cuh"
#include "utils/helper_multi_gpu.cuh"
#include <cmath>
#include <functional>

@@ -528,10 +530,10 @@ template <typename Torus> struct int_radix_lut {
  }

  // Return a pointer to idx-ith degree
  Torus *get_degree(size_t idx) { return &degrees[num_many_lut * idx]; }
  uint64_t *get_degree(size_t idx) { return &degrees[num_many_lut * idx]; }

  // Return a pointer to idx-ith max degree
  Torus *get_max_degree(size_t idx) { return &max_degrees[idx]; }
  uint64_t *get_max_degree(size_t idx) { return &max_degrees[idx]; }

  // Return a pointer to idx-ith lut indexes at gpu_index's global memory
  Torus *get_lut_indexes(uint32_t gpu_index, size_t ind) {
@@ -646,6 +648,206 @@ template <typename Torus> struct int_radix_lut {
    free(max_degrees);
  }
};

template <typename InputTorus> struct int_noise_squashing_lut {

  int_radix_params params;
  uint32_t input_glwe_dimension;
  uint32_t input_polynomial_size;
  uint32_t input_big_lwe_dimension;
  uint32_t num_blocks;
  // Tracks the degree of each LUT and the max degree on CPU
  // The max degree is (message_modulus * carry_modulus - 1) except for many
  // lut, for which it's different
  uint64_t *degrees;
  uint64_t *max_degrees;

  int active_gpu_count;

  // There will be one buffer on each GPU in multi-GPU computations
  // (same for tmp lwe arrays)
  std::vector<int8_t *> pbs_buffer;

  std::vector<__uint128_t *> lut_vec;

  uint32_t *gpu_indexes;
  CudaRadixCiphertextFFI *tmp_lwe_before_ks;

  // All tmp lwe arrays and index arrays for lwe contain the total
  // amount of blocks to be computed on; there is no split between GPUs
  // for the moment
  InputTorus *lwe_indexes_in;

  InputTorus *h_lwe_indexes_in;
  InputTorus *h_lwe_indexes_out;
  InputTorus *lwe_trivial_indexes;

  /// For multi GPU execution we create vectors of pointers for inputs and
  /// outputs
  std::vector<InputTorus *> lwe_array_in_vec;
  std::vector<InputTorus *> lwe_after_ks_vec;
  std::vector<__uint128_t *> lwe_after_pbs_vec;
  std::vector<InputTorus *> lwe_trivial_indexes_vec;

  bool using_trivial_lwe_indexes = true;
  bool gpu_memory_allocated;
  // noise squashing constructor
  int_noise_squashing_lut(cudaStream_t const *streams,
                          uint32_t const *input_gpu_indexes, uint32_t gpu_count,
                          int_radix_params params,
                          uint32_t input_glwe_dimension,
                          uint32_t input_polynomial_size,
                          uint32_t num_radix_blocks,
                          uint32_t original_num_blocks,
                          bool allocate_gpu_memory, uint64_t *size_tracker) {
    this->params = params;
    this->num_blocks = num_radix_blocks;
    gpu_memory_allocated = allocate_gpu_memory;
    // These are the glwe dimension and polynomial size before squashing
    this->input_glwe_dimension = input_glwe_dimension;
    this->input_polynomial_size = input_polynomial_size;
    uint32_t input_big_lwe_dimension =
        input_glwe_dimension * input_polynomial_size;
    this->input_big_lwe_dimension = input_big_lwe_dimension;

    uint32_t lut_buffer_size = (params.glwe_dimension + 1) *
                               params.polynomial_size * sizeof(__uint128_t);

    gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
    std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t));

    ///////////////
    active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_set_device(i);
      auto num_radix_blocks_on_gpu =
          get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
      int8_t *gpu_pbs_buffer;
      uint64_t size = 0;
      execute_scratch_pbs_128(streams[i], gpu_indexes[i], &gpu_pbs_buffer,
                              params.small_lwe_dimension, params.glwe_dimension,
                              params.polynomial_size, params.pbs_level,
                              num_radix_blocks_on_gpu, allocate_gpu_memory,
                              params.allocate_ms_array, &size);
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
      if (i == 0 && size_tracker != nullptr) {
        *size_tracker += size;
      }
      pbs_buffer.push_back(gpu_pbs_buffer);
    }
    lwe_indexes_in = (InputTorus *)cuda_malloc_with_size_tracking_async(
        num_radix_blocks * sizeof(InputTorus), streams[0], gpu_indexes[0],
        size_tracker, allocate_gpu_memory);
    lwe_trivial_indexes = (InputTorus *)cuda_malloc_with_size_tracking_async(
        num_radix_blocks * sizeof(InputTorus), streams[0], gpu_indexes[0],
        size_tracker, allocate_gpu_memory);
    h_lwe_indexes_in =
        (InputTorus *)malloc(num_radix_blocks * sizeof(InputTorus));
    for (int i = 0; i < num_radix_blocks; i++)
      h_lwe_indexes_in[i] = i;

    cuda_memcpy_with_size_tracking_async_to_gpu(
        lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(InputTorus),
        streams[0], gpu_indexes[0], allocate_gpu_memory);
    cuda_memcpy_with_size_tracking_async_to_gpu(
        lwe_trivial_indexes, h_lwe_indexes_in,
        num_radix_blocks * sizeof(InputTorus), streams[0], gpu_indexes[0],
        allocate_gpu_memory);

    multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count,
                              lwe_array_in_vec, num_radix_blocks,
                              params.big_lwe_dimension + 1, size_tracker,
                              allocate_gpu_memory);

    multi_gpu_alloc_lwe_async<InputTorus>(
        streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
        num_radix_blocks, params.small_lwe_dimension + 1, size_tracker,
        allocate_gpu_memory);
    multi_gpu_alloc_lwe_async<__uint128_t>(
        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
        num_radix_blocks, params.big_lwe_dimension + 1, size_tracker,
        allocate_gpu_memory);
    multi_gpu_alloc_array_async<InputTorus>(
        streams, gpu_indexes, active_gpu_count, lwe_trivial_indexes_vec,
        num_radix_blocks, size_tracker, allocate_gpu_memory);
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);

    multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count,
                               lwe_trivial_indexes_vec, lwe_trivial_indexes,
                               num_radix_blocks, allocate_gpu_memory);
    if (allocate_gpu_memory) {
      // Allocate LUT
      // LUT is used as a trivial encryption and must be initialized outside
      // this constructor
      for (uint i = 0; i < active_gpu_count; i++) {
        auto lut = (__uint128_t *)cuda_malloc_with_size_tracking_async(
            lut_buffer_size, streams[i], gpu_indexes[i], size_tracker,
            allocate_gpu_memory);
        lut_vec.push_back(lut);
        cuda_synchronize_stream(streams[i], gpu_indexes[i]);
      }
    }
    // Keyswitch
    tmp_lwe_before_ks = new CudaRadixCiphertextFFI;
    create_zero_radix_ciphertext_async<InputTorus>(
        streams[0], gpu_indexes[0], tmp_lwe_before_ks, original_num_blocks,
        input_big_lwe_dimension, size_tracker, allocate_gpu_memory);

    degrees = (uint64_t *)malloc(sizeof(uint64_t));
    max_degrees = (uint64_t *)malloc(sizeof(uint64_t));

    // LUT for the squashing
    auto f_squash = [](__uint128_t block) -> __uint128_t { return block; };

    // Generate the identity LUT; for now we only use one GPU
    for (uint i = 0; i < active_gpu_count; i++) {
      auto squash_lut = lut_vec[i];
      generate_device_accumulator<__uint128_t>(
          streams[i], gpu_indexes[i], squash_lut, degrees, max_degrees,
          params.glwe_dimension, params.polynomial_size, params.message_modulus,
          params.carry_modulus, f_squash, allocate_gpu_memory);
    }
  }
  void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
               uint32_t gpu_count) {
    free(this->gpu_indexes);
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_drop_with_size_tracking_async(lut_vec[i], streams[i], gpu_indexes[i],
                                         gpu_memory_allocated);
    }
    cuda_drop_with_size_tracking_async(lwe_indexes_in, streams[0],
                                       gpu_indexes[0], gpu_memory_allocated);
    cuda_drop_with_size_tracking_async(lwe_trivial_indexes, streams[0],
                                       gpu_indexes[0], gpu_memory_allocated);
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
    lut_vec.clear();
    free(h_lwe_indexes_in);

    release_radix_ciphertext_async(streams[0], gpu_indexes[0],
                                   tmp_lwe_before_ks, gpu_memory_allocated);
    for (int i = 0; i < pbs_buffer.size(); i++) {
      cleanup_cuda_programmable_bootstrap_128(streams[i], gpu_indexes[i],
                                              &pbs_buffer[i]);
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
    }

    multi_gpu_release_async(streams, gpu_indexes, lwe_array_in_vec);
    multi_gpu_release_async(streams, gpu_indexes, lwe_after_ks_vec);
    multi_gpu_release_async(streams, gpu_indexes, lwe_after_pbs_vec);
    multi_gpu_release_async(streams, gpu_indexes, lwe_trivial_indexes_vec);
    for (uint i = 0; i < active_gpu_count; i++)
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
    lwe_array_in_vec.clear();
    lwe_after_ks_vec.clear();
    lwe_after_pbs_vec.clear();
    lwe_trivial_indexes_vec.clear();

    delete tmp_lwe_before_ks;
    pbs_buffer.clear();
  }
};
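The constructor above sizes one PBS buffer per active GPU, with get_active_gpu_count and get_num_inputs_on_gpu deciding the split. A plausible model of that split, assuming a plain remainder-spreading distribution (the real helpers live in helper_multi_gpu.cuh and may differ in detail):

#include <algorithm>

// Hypothetical re-implementation, for illustration only.
int active_gpu_count_model(int num_inputs, int gpu_count) {
  return std::min(num_inputs, gpu_count); // never use more GPUs than inputs
}

int num_inputs_on_gpu_model(int total, int gpu_index, int gpu_count) {
  // The first `total % gpu_count` GPUs each take one extra input.
  int base = total / gpu_count;
  return base + (gpu_index < total % gpu_count ? 1 : 0);
}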

template <typename Torus> struct int_bit_extract_luts_buffer {
  int_radix_params params;
  int_radix_lut<Torus> *lut;
@@ -0,0 +1,13 @@
#ifndef CUDA_BOOTSTRAP_128_H
#define CUDA_BOOTSTRAP_128_H

#include "pbs_enums.h"
#include <stdint.h>

uint64_t scratch_cuda_programmable_bootstrap_128_vector_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    bool allocate_gpu_memory, bool allocate_ms_array);

#endif // CUDA_BOOTSTRAP_128_H
@@ -240,14 +240,12 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
  }
};

template <PBS_TYPE pbs_type> struct pbs_buffer_128;

template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
template <typename InputTorus, PBS_TYPE pbs_type> struct pbs_buffer_128 {
  int8_t *d_mem;

  __uint128_t *global_accumulator;
  double *global_join_buffer;
  __uint128_t *temp_lwe_array_in;
  InputTorus *temp_lwe_array_in;
  uint64_t *trivial_indexes;

  PBS_VARIANT pbs_variant;

@@ -265,11 +263,9 @@ template <> struct pbs_buffer_128<PBS_TYPE::CLASSICAL> {
    this->pbs_variant = pbs_variant;
    this->uses_noise_reduction = allocate_ms_array;
    if (allocate_ms_array) {
      this->temp_lwe_array_in =
          (__uint128_t *)cuda_malloc_with_size_tracking_async(
              (lwe_dimension + 1) * input_lwe_ciphertext_count *
                  sizeof(__uint128_t),
              stream, gpu_index, size_tracker, allocate_ms_array);
      this->temp_lwe_array_in = (InputTorus *)cuda_malloc_async(
          (lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(InputTorus),
          stream, gpu_index);
      this->trivial_indexes = (uint64_t *)cuda_malloc_with_size_tracking_async(
          input_lwe_ciphertext_count * sizeof(uint64_t), stream, gpu_index,
          size_tracker, allocate_ms_array);
@@ -525,6 +521,10 @@ bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
                                                    uint32_t level_count,
                                                    uint32_t max_shared_memory);

bool has_support_to_cuda_programmable_bootstrap_128_cg(
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
    uint32_t num_samples, uint32_t max_shared_memory);

#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
                                         int glwe_dimension,

@@ -100,7 +100,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
    void const *lut_vector, void const *lwe_array_in,
    void const *bootstrapping_key,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
    void const *ms_noise_reduction_ptr, int8_t *buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples);
@@ -178,10 +178,12 @@ __device__ __forceinline__ double measure_modulus_switch_noise(

// Each thread processes two elements of the lwe array
template <typename Torus>
__global__ void improve_noise_modulus_switch(
    Torus *array_out, const Torus *array_in, const uint64_t *indexes,
    const Torus *zeros, int lwe_size, int num_zeros, double input_variance,
    double r_sigma, double bound, uint32_t log_modulus) {
__global__ void __launch_bounds__(512)
    improve_noise_modulus_switch(Torus *array_out, const Torus *array_in,
                                 const uint64_t *indexes, const Torus *zeros,
                                 int lwe_size, int num_zeros,
                                 double input_variance, double r_sigma,
                                 double bound, uint32_t log_modulus) {

  // First we will assume size is less than the number of threads per block
  // I should switch this to dynamic shared memory
@@ -386,3 +386,69 @@ void reverseArray(uint64_t arr[], size_t n) {
    end--;
  }
}

uint64_t scratch_cuda_apply_noise_squashing_mem(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int_radix_params params, int_noise_squashing_lut<uint64_t> **mem_ptr,
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t num_radix_blocks, uint32_t original_num_blocks,
    bool allocate_gpu_memory) {

  uint64_t size_tracker = 0;
  *mem_ptr = new int_noise_squashing_lut<uint64_t>(
      (cudaStream_t *)streams, gpu_indexes, gpu_count, params, glwe_dimension,
      polynomial_size, num_radix_blocks, original_num_blocks,
      allocate_gpu_memory, &size_tracker);
  return size_tracker;
}

uint64_t scratch_cuda_apply_noise_squashing_kb(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t input_glwe_dimension,
    uint32_t input_polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_radix_blocks, uint32_t original_num_blocks,
    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
    bool allocate_gpu_memory, bool allocate_ms_array) {
  PUSH_RANGE("scratch noise squashing")
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          glwe_dimension * polynomial_size, lwe_dimension,
                          ks_level, ks_base_log, pbs_level, pbs_base_log,
                          grouping_factor, message_modulus, carry_modulus,
                          allocate_ms_array);

  uint64_t size_tracker = scratch_cuda_apply_noise_squashing_mem(
      streams, gpu_indexes, gpu_count, params,
      (int_noise_squashing_lut<uint64_t> **)mem_ptr, input_glwe_dimension,
      input_polynomial_size, num_radix_blocks, original_num_blocks,
      allocate_gpu_memory);
  // Close the NVTX range before returning, so the range is popped on this path
  POP_RANGE()
  return size_tracker;
}

void cuda_apply_noise_squashing_kb(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    CudaRadixCiphertextFFI *output_radix_lwe,
    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
    void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void *const *bsks) {

  PUSH_RANGE("apply noise squashing")
  integer_radix_apply_noise_squashing_kb<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
      input_radix_lwe, (int_noise_squashing_lut<uint64_t> *)mem_ptr, bsks,
      (uint64_t **)ksks, ms_noise_reduction_key);
  POP_RANGE()
}

void cleanup_cuda_apply_noise_squashing_kb(void *const *streams,
                                           uint32_t const *gpu_indexes,
                                           uint32_t gpu_count,
                                           int8_t **mem_ptr_void) {
  PUSH_RANGE("cleanup noise squashing")
  int_noise_squashing_lut<uint64_t> *mem_ptr =
      (int_noise_squashing_lut<uint64_t> *)(*mem_ptr_void);
  mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
  POP_RANGE()
}
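The scratch function reports, through its return value, the bytes the squashing buffers occupy. If allocate_gpu_memory = false behaves as a size-only dry run, as the cuda_malloc_with_size_tracking_async calls above suggest, a caller could probe the footprint before committing memory; a sketch under that assumption, with placeholder variables:

// Dry run: ask for the footprint without allocating (assumed semantics).
int8_t *probe = nullptr;
uint64_t needed = scratch_cuda_apply_noise_squashing_kb(
    streams, gpu_indexes, gpu_count, &probe, lwe_dim, glwe_dim, N,
    input_glwe_dim, input_N, ks_level, ks_base_log, pbs_level, pbs_base_log,
    grouping_factor, num_blocks, num_original_blocks, msg_mod, carry_mod,
    pbs_type, /*allocate_gpu_memory=*/false, /*allocate_ms_array=*/true);
cleanup_cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count, &probe);

// Only allocate for real if the device has room (check is illustrative).
if (needed <= free_device_bytes) { /* call scratch again with true */ }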
@@ -9,6 +9,7 @@
#include "linear_algebra.h"
#include "linearalgebra/addition.cuh"
#include "linearalgebra/negation.cuh"
#include "pbs/pbs_128_utilities.h"
#include "pbs/programmable_bootstrap.h"
#include "polynomial/functions.cuh"
#include "utils/helper.cuh"

@@ -866,7 +867,7 @@ uint64_t generate_lookup_table_with_encoding(
  memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));

  auto body = &acc[glwe_dimension * polynomial_size];
  uint64_t degree = 0;
  Torus degree = 0;

  // This accumulator extracts the carry bits
  for (int i = 0; i < input_modulus_sup; i++) {

@@ -886,7 +887,7 @@ uint64_t generate_lookup_table_with_encoding(
  }

  rotate_left<Torus>(body, half_box_size, polynomial_size);
  return degree;
  return (uint64_t)degree;
}

template <typename Torus>
@@ -2200,4 +2201,110 @@ void host_single_borrow_propagate(
  }
}

/// num_radix_blocks corresponds to the number of blocks on which to apply the
/// LUT. In scalar bitops we use a number of blocks that may be lower than or
/// equal to the input and output numbers of blocks.
template <typename InputTorus>
__host__ void integer_radix_apply_noise_squashing_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
    CudaRadixCiphertextFFI const *lwe_array_in,
    int_noise_squashing_lut<InputTorus> *lut, void *const *bsks,
    InputTorus *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  PUSH_RANGE("apply noise squashing")
  auto params = lut->params;
  auto pbs_type = params.pbs_type;
  auto big_lwe_dimension = params.big_lwe_dimension;
  auto small_lwe_dimension = params.small_lwe_dimension;
  auto ks_level = params.ks_level;
  auto ks_base_log = params.ks_base_log;
  auto pbs_level = params.pbs_level;
  auto pbs_base_log = params.pbs_base_log;
  auto glwe_dimension = params.glwe_dimension;
  auto polynomial_size = params.polynomial_size;
  auto grouping_factor = params.grouping_factor;

  if (lwe_array_out->num_radix_blocks !=
      (lwe_array_in->num_radix_blocks + 1) / 2)
    PANIC("Cuda error: the number of output radix blocks should be "
          "half (rounded up) the number of input radix blocks")

  /// For multi GPU execution we create vectors of pointers for inputs and
  /// outputs
  auto lwe_array_pbs_in = lut->tmp_lwe_before_ks;
  std::vector<InputTorus *> lwe_array_in_vec = lut->lwe_array_in_vec;
  std::vector<InputTorus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
  std::vector<__uint128_t *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
  std::vector<InputTorus *> lwe_trivial_indexes_vec =
      lut->lwe_trivial_indexes_vec;

  // We know carry is empty so we can pack two blocks in one
  pack_blocks<InputTorus>(streams[0], gpu_indexes[0], lwe_array_pbs_in,
                          lwe_array_in, lwe_array_in->num_radix_blocks,
                          params.message_modulus);

  // Since the radix ciphertexts are packed, we have to use the num_radix_blocks
  // from the output ct
  auto active_gpu_count =
      get_active_gpu_count(lwe_array_out->num_radix_blocks, gpu_count);
  if (active_gpu_count == 1) {
    execute_keyswitch_async<InputTorus>(
        streams, gpu_indexes, 1, lwe_after_ks_vec[0],
        lwe_trivial_indexes_vec[0], (InputTorus *)lwe_array_pbs_in->ptr,
        lut->lwe_indexes_in, ksks, lut->input_big_lwe_dimension,
        small_lwe_dimension, ks_base_log, ks_level,
        lwe_array_out->num_radix_blocks);

    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
    /// dimension to a big LWE dimension
    execute_pbs_128_async<__uint128_t>(
        streams, gpu_indexes, 1, (__uint128_t *)lwe_array_out->ptr,
        lut->lut_vec, lwe_after_ks_vec[0], bsks, ms_noise_reduction_key,
        lut->pbs_buffer, small_lwe_dimension, glwe_dimension, polynomial_size,
        pbs_base_log, pbs_level, lwe_array_out->num_radix_blocks);
  } else {
    /// Make sure all data that should be on GPU 0 is indeed there
    cuda_synchronize_stream(streams[0], gpu_indexes[0]);

    /// With multiple GPUs we push to the vectors on each GPU then when we
    /// gather data to GPU 0 we can copy back to the original indexing
    multi_gpu_scatter_lwe_async<InputTorus>(
        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
        (InputTorus *)lwe_array_pbs_in->ptr, lut->h_lwe_indexes_in,
        lut->using_trivial_lwe_indexes, lwe_array_out->num_radix_blocks,
        lut->input_big_lwe_dimension + 1);

    execute_keyswitch_async<InputTorus>(
        streams, gpu_indexes, active_gpu_count, lwe_after_ks_vec,
        lwe_trivial_indexes_vec, lwe_array_in_vec, lwe_trivial_indexes_vec,
        ksks, lut->input_big_lwe_dimension, small_lwe_dimension, ks_base_log,
        ks_level, lwe_array_out->num_radix_blocks);

    execute_pbs_128_async<__uint128_t>(
        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec, lut->lut_vec,
        lwe_after_ks_vec, bsks, ms_noise_reduction_key, lut->pbs_buffer,
        small_lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
        pbs_level, lwe_array_out->num_radix_blocks);

    /// Copy data back to GPU 0 and release vecs
    multi_gpu_gather_lwe_async<__uint128_t>(
        streams, gpu_indexes, active_gpu_count,
        (__uint128_t *)lwe_array_out->ptr, lwe_after_pbs_vec,
        (__uint128_t *)lut->h_lwe_indexes_out, lut->using_trivial_lwe_indexes,
        lwe_array_out->num_radix_blocks, big_lwe_dimension + 1);

    /// Synchronize all GPUs
    for (uint i = 0; i < active_gpu_count; i++) {
      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
    }
  }
  for (uint i = 0; i < lut->num_blocks; i++) {
    lwe_array_out->degrees[i] = lut->degrees[0];
    lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
  }
  POP_RANGE()
}

#endif // TFHE_RS_INTERNAL_INTEGER_CUH
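Since pairs of blocks are packed before the keyswitch, n input blocks produce ceil(n / 2) outputs, which is exactly the (n + 1) / 2 guard above. A standalone check (not from the commit):

#include <cassert>

// ceil(n / 2) via integer arithmetic, as used in the block-count guard.
constexpr unsigned packed_blocks(unsigned num_input_blocks) {
  return (num_input_blocks + 1) / 2;
}

static_assert(packed_blocks(4) == 2, "4 packed blocks -> 2 outputs");
static_assert(packed_blocks(5) == 3, "odd counts keep the last block alone");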
@@ -0,0 +1,46 @@
#ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_128_CUH
#define CUDA_PROGRAMMABLE_BOOTSTRAP_128_CUH
#include "pbs/pbs_128_utilities.h"

static void
execute_scratch_pbs_128(void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
                        uint32_t lwe_dimension, uint32_t glwe_dimension,
                        uint32_t polynomial_size, uint32_t level_count,
                        uint32_t input_lwe_ciphertext_count,
                        bool allocate_gpu_memory, bool allocate_ms_array,
                        uint64_t *size_tracker_on_gpu) {
  // The squash noise function receives as input 64-bit integers
  *size_tracker_on_gpu = scratch_cuda_programmable_bootstrap_128_vector_64(
      stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
      polynomial_size, level_count, input_lwe_ciphertext_count,
      allocate_gpu_memory, allocate_ms_array);
}
template <typename Torus>
static void execute_pbs_128_async(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, const LweArrayVariant<__uint128_t> &lwe_array_out,
    const std::vector<Torus *> lut_vector,
    const LweArrayVariant<uint64_t> &lwe_array_in,
    void *const *bootstrapping_keys,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    std::vector<int8_t *> pbs_buffer, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples) {

  for (uint32_t i = 0; i < gpu_count; i++) {
    int num_inputs_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count);

    Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i);
    uint64_t *current_lwe_array_in = GET_VARIANT_ELEMENT_64BIT(lwe_array_in, i);
    void *zeros = nullptr;
    if (ms_noise_reduction_key != nullptr)
      zeros = ms_noise_reduction_key->ptr[i];

    cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
        streams[i], gpu_indexes[i], current_lwe_array_out, lut_vector[i],
        current_lwe_array_in, bootstrapping_keys[i], ms_noise_reduction_key,
        zeros, pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size,
        base_log, level_count, num_inputs_on_gpu);
  }
}
#endif
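Note how the loop accepts either one device array or one array per GPU through LweArrayVariant, hence the two GET_VARIANT_ELEMENT* lookups. A sketch of the two call shapes, with placeholder pointers:

// Single-GPU case: the variant holds one raw pointer.
LweArrayVariant<uint64_t> in_single = raw_input_ptr;

// Multi-GPU case: the variant holds one pointer per active GPU.
std::vector<uint64_t *> per_gpu_inputs = {ptr_gpu0, ptr_gpu1};
LweArrayVariant<uint64_t> in_multi = per_gpu_inputs;

// Either form can be handed to execute_pbs_128_async; the loop picks the
// right element for GPU i via the GET_VARIANT_ELEMENT* macros.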
@@ -665,7 +665,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
  if (ms_noise_reduction_key->num_zeros != 0) {
    uint32_t log_modulus = log2(polynomial_size) + 1;
    host_improve_noise_modulus_switch<uint64_t>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<const cudaStream_t>(stream), gpu_index,
        buffer->temp_lwe_array_in,
        static_cast<uint64_t const *>(lwe_array_in),
        static_cast<uint64_t const *>(lwe_input_indexes),
@@ -8,124 +8,67 @@ bool has_support_to_cuda_programmable_bootstrap_128_cg(
      max_shared_memory);
}

/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the PBS on 128-bit inputs, into `buffer`. It also configures SM options on
 * the GPU in case FULLSM or PARTIALSM mode is going to be used.
 */
uint64_t scratch_cuda_programmable_bootstrap_128_vector_64(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    bool allocate_gpu_memory, bool allocate_ms_array) {

  return scratch_cuda_programmable_bootstrap_128_vector<uint64_t>(
      stream, gpu_index,
      (pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> **)pbs_buffer,
      lwe_dimension, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
}

uint64_t scratch_cuda_programmable_bootstrap_128(
    void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
    bool allocate_gpu_memory, bool allocate_ms_array) {

  auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
  auto buffer = (pbs_buffer_128<CLASSICAL> **)pbs_buffer;

  if (has_support_to_cuda_programmable_bootstrap_128_cg(
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, max_shared_memory)) {
    switch (polynomial_size) {
    case 256:
      return scratch_programmable_bootstrap_cg_128<AmortizedDegree<256>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 512:
      return scratch_programmable_bootstrap_cg_128<AmortizedDegree<512>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 1024:
      return scratch_programmable_bootstrap_cg_128<AmortizedDegree<1024>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 2048:
      return scratch_programmable_bootstrap_cg_128<AmortizedDegree<2048>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 4096:
      return scratch_programmable_bootstrap_cg_128<AmortizedDegree<4096>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    default:
      PANIC("Cuda error (classical PBS128): unsupported polynomial size. "
            "Supported N's are powers of two"
            " in the interval [256..4096].")
    }
  } else {
    switch (polynomial_size) {
    case 256:
      return scratch_programmable_bootstrap_128<AmortizedDegree<256>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 512:
      return scratch_programmable_bootstrap_128<AmortizedDegree<512>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 1024:
      return scratch_programmable_bootstrap_128<AmortizedDegree<1024>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 2048:
      return scratch_programmable_bootstrap_128<AmortizedDegree<2048>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    case 4096:
      return scratch_programmable_bootstrap_128<AmortizedDegree<4096>>(
          static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
          glwe_dimension, polynomial_size, level_count,
          input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
    default:
      PANIC("Cuda error (classical PBS): unsupported polynomial size. "
            "Supported N's are powers of two"
            " in the interval [256..4096].")
    }
  }
  return scratch_cuda_programmable_bootstrap_128_vector_64(
      stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
      polynomial_size, level_count, input_lwe_ciphertext_count,
      allocate_gpu_memory, allocate_ms_array);
}
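The removed body above dispatched on polynomial_size, one case per supported degree; the replacement funnels everything through the _vector_64 entry point, which performs the same dispatch for 64-bit inputs. For readers unfamiliar with the pattern, a hypothetical factoring of such a runtime-to-compile-time dispatch (the Fn wrapper is invented for illustration; AmortizedDegree and PANIC are the codebase's own):

#include <utility>

// Illustrative only: maps a runtime polynomial size onto the compile-time
// AmortizedDegree<N> instantiations the kernels require.
template <template <class> class Fn, typename... Args>
uint64_t dispatch_on_degree(uint32_t polynomial_size, Args &&...args) {
  switch (polynomial_size) {
  case 256:  return Fn<AmortizedDegree<256>>::run(std::forward<Args>(args)...);
  case 512:  return Fn<AmortizedDegree<512>>::run(std::forward<Args>(args)...);
  case 1024: return Fn<AmortizedDegree<1024>>::run(std::forward<Args>(args)...);
  case 2048: return Fn<AmortizedDegree<2048>>::run(std::forward<Args>(args)...);
  case 4096: return Fn<AmortizedDegree<4096>>::run(std::forward<Args>(args)...);
  default:
    PANIC("unsupported polynomial size: powers of two in [256..4096] only")
  }
}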
template <typename Torus>
template <typename InputTorus>
void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus const *lut_vector, Torus *lwe_array_in,
    double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
    void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    __uint128_t const *lut_vector, InputTorus *lwe_array_in,
    double const *bootstrapping_key,
    pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {

  switch (polynomial_size) {
  case 256:
    host_programmable_bootstrap_128<AmortizedDegree<256>>(
    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 512:
    host_programmable_bootstrap_128<AmortizedDegree<512>>(
    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 1024:
    host_programmable_bootstrap_128<AmortizedDegree<1024>>(
    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 2048:
    host_programmable_bootstrap_128<AmortizedDegree<2048>>(
    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 4096:
    host_programmable_bootstrap_128<AmortizedDegree<4096>>(
    host_programmable_bootstrap_128<InputTorus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
@@ -137,41 +80,42 @@ void executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
  }
}

template <typename Torus>
template <typename InputTorus>
void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, Torus *lwe_array_out,
    Torus const *lut_vector, Torus *lwe_array_in,
    double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
    void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
    __uint128_t const *lut_vector, InputTorus *lwe_array_in,
    double const *bootstrapping_key,
    pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {

  switch (polynomial_size) {
  case 256:
    host_programmable_bootstrap_cg_128<AmortizedDegree<256>>(
    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<256>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 512:
    host_programmable_bootstrap_cg_128<AmortizedDegree<512>>(
    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<512>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 1024:
    host_programmable_bootstrap_cg_128<AmortizedDegree<1024>>(
    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<1024>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 2048:
    host_programmable_bootstrap_cg_128<AmortizedDegree<2048>>(
    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<2048>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
    break;
  case 4096:
    host_programmable_bootstrap_cg_128<AmortizedDegree<4096>>(
    host_programmable_bootstrap_cg_128<InputTorus, AmortizedDegree<4096>>(
        static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out, lut_vector,
        lwe_array_in, bootstrapping_key, buffer, glwe_dimension, lwe_dimension,
        polynomial_size, base_log, level_count, num_samples);
@@ -183,6 +127,57 @@ void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
  }
}

template <typename InputTorus>
void host_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    __uint128_t const *lut_vector, void const *lwe_array_in,
    void const *bootstrapping_key,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void const *ms_noise_reduction_ptr,
    pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
  if (base_log > 64)
    PANIC("Cuda error (classical PBS): base log should be <= 64")

  // If the parameters contain a noise reduction key, apply it
  if (ms_noise_reduction_key->num_zeros != 0) {
    uint32_t log_modulus = log2(polynomial_size) + 1;
    host_improve_noise_modulus_switch<InputTorus>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<InputTorus *>(buffer->temp_lwe_array_in),
        static_cast<InputTorus const *>(lwe_array_in),
        static_cast<uint64_t const *>(buffer->trivial_indexes),
        static_cast<const InputTorus *>(ms_noise_reduction_ptr),
        lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
        ms_noise_reduction_key->ms_input_variance,
        ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
        log_modulus);
  } else {
    buffer->temp_lwe_array_in =
        const_cast<InputTorus *>(static_cast<const InputTorus *>(lwe_array_in));
  }
  switch (buffer->pbs_variant) {
  case DEFAULT:
    executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<InputTorus>(
        stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
        lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
        static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case CG:
    executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128<
        InputTorus>(
        stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
        lut_vector, static_cast<InputTorus *>(buffer->temp_lwe_array_in),
        static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  default:
    PANIC("Cuda error (PBS): unknown pbs variant.")
  }
}
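A design point worth flagging in the new host function: when no noise-reduction zeros are present, temp_lwe_array_in is re-pointed at the caller's input rather than copied. The pattern in isolation (a sketch with invented names; a real release path must avoid freeing the aliased pointer):

// Sketch of the aliasing pattern used above. `owned` tracks whether the
// staging buffer points at memory it allocated itself.
template <typename T> struct staging {
  T *ptr = nullptr;
  bool owned = false;
};

template <typename T>
void stage_input(staging<T> &s, const T *input, bool needs_preprocessing) {
  if (needs_preprocessing) {
    // preprocess into s.ptr (allocated elsewhere); s.owned stays true
  } else {
    s.ptr = const_cast<T *>(input); // zero-copy alias into caller memory
    s.owned = false;
  }
}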
/* Perform bootstrapping on a batch of input u128 LWE ciphertexts, storing the
 * result in the same index for each ciphertext.
 *
@@ -237,57 +232,22 @@ void executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128(
 */
void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void *streams, uint32_t gpu_index, void *lwe_array_out,
    void const *lut_vector, void const *lwe_array_in,
    void const *bootstrapping_key,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
    void *ms_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
    void const *ms_noise_reduction_ptr, int8_t *mem_ptr, uint32_t lwe_dimension,
    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples) {
  if (base_log > 64)
    PANIC("Cuda error (classical PBS): base log should be <= 64")
  pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *buffer =
      (pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *)mem_ptr;

  pbs_buffer_128<CLASSICAL> *buffer = (pbs_buffer_128<CLASSICAL> *)mem_ptr;

  // If the parameters contain a noise reduction key, apply it
  if (ms_noise_reduction_key->num_zeros != 0) {
    uint32_t log_modulus = log2(polynomial_size) + 1;
    host_improve_noise_modulus_switch<__uint128_t>(
        static_cast<cudaStream_t>(stream), gpu_index,
        static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
        static_cast<__uint128_t const *>(lwe_array_in),
        static_cast<uint64_t const *>(buffer->trivial_indexes),
        static_cast<const __uint128_t *>(ms_noise_reduction_ptr),
        lwe_dimension + 1, num_samples, ms_noise_reduction_key->num_zeros,
        ms_noise_reduction_key->ms_input_variance,
        ms_noise_reduction_key->ms_r_sigma, ms_noise_reduction_key->ms_bound,
        log_modulus);
  } else {
    buffer->temp_lwe_array_in = const_cast<__uint128_t *>(
        static_cast<const __uint128_t *>(lwe_array_in));
  }

  switch (buffer->pbs_variant) {
  case DEFAULT:
    executor_cuda_programmable_bootstrap_lwe_ciphertext_vector_128<__uint128_t>(
        stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
        static_cast<const __uint128_t *>(lut_vector),
        static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
        static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  case CG:
    executor_cuda_programmable_bootstrap_cg_lwe_ciphertext_vector_128<
        __uint128_t>(
        stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
        static_cast<const __uint128_t *>(lut_vector),
        static_cast<__uint128_t *>(buffer->temp_lwe_array_in),
        static_cast<const double *>(bootstrapping_key), buffer, lwe_dimension,
        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
    break;
  default:
    PANIC("Cuda error (PBS): unknown pbs variant.")
  }
  host_programmable_bootstrap_lwe_ciphertext_vector_128<uint64_t>(
      streams, gpu_index, lwe_array_out,
      static_cast<const __uint128_t *>(lut_vector), lwe_array_in,
      bootstrapping_key, ms_noise_reduction_key, ms_noise_reduction_ptr, buffer,
      lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
      num_samples);
}

/*
@@ -296,6 +256,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_128(
 */
void cleanup_cuda_programmable_bootstrap_128(void *stream, uint32_t gpu_index,
                                             int8_t **buffer) {
  auto x = (pbs_buffer_128<CLASSICAL> *)(*buffer);
  auto x = (pbs_buffer_128<__uint128_t, PBS_TYPE::CLASSICAL> *)(*buffer);
  x->release(static_cast<cudaStream_t>(stream), gpu_index);
}
@@ -74,16 +74,17 @@ __device__ void mul_ggsw_glwe_in_fourier_domain_128(
  __syncthreads();
}

template <typename Torus, class params, sharedMemDegree SMD, bool first_iter>
template <typename InputTorus, class params, sharedMemDegree SMD,
          bool first_iter>
__global__ void __launch_bounds__(params::degree / params::opt)
    device_programmable_bootstrap_step_one_128(
        const Torus *__restrict__ lut_vector,
        const Torus *__restrict__ lwe_array_in,
        const double *__restrict__ bootstrapping_key, Torus *global_accumulator,
        double *global_join_buffer, uint32_t lwe_iteration,
        uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
        uint32_t level_count, int8_t *device_mem,
        uint64_t device_memory_size_per_block) {
        const __uint128_t *__restrict__ lut_vector,
        const InputTorus *__restrict__ lwe_array_in,
        const double *__restrict__ bootstrapping_key,
        __uint128_t *global_accumulator, double *global_join_buffer,
        uint32_t lwe_iteration, uint32_t lwe_dimension,
        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
        int8_t *device_mem, uint64_t device_memory_size_per_block) {

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is

@@ -100,22 +101,22 @@ __global__ void __launch_bounds__(params::degree / params::opt)
    selected_memory = &device_mem[block_index * device_memory_size_per_block];
  }

  Torus *accumulator = (Torus *)selected_memory;
  __uint128_t *accumulator = (__uint128_t *)selected_memory;
  double *accumulator_fft =
      (double *)accumulator +
      (ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double));
      (ptrdiff_t)(sizeof(__uint128_t) * polynomial_size / sizeof(double));

  if constexpr (SMD == PARTIALSM)
    accumulator_fft = (double *)sharedmem;

  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
  const Torus *block_lwe_array_in =
  const InputTorus *block_lwe_array_in =
      &lwe_array_in[blockIdx.x * (lwe_dimension + 1)];

  const Torus *block_lut_vector = lut_vector;
  const __uint128_t *block_lut_vector = lut_vector;

  Torus *global_slice =
  __uint128_t *global_slice =
      global_accumulator +
      (blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;

@@ -127,12 +128,12 @@ __global__ void __launch_bounds__(params::degree / params::opt)
  if constexpr (first_iter) {
    // First iteration
    // Put "b" in [0, 2N[
    Torus b_hat = 0;
    modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
                   params::log2_degree + 1);
    InputTorus b_hat = 0;
    modulus_switch<InputTorus>(block_lwe_array_in[lwe_dimension], b_hat,
                               params::log2_degree + 1);
    // The y-dimension is used to select the element of the GLWE this block will
    // compute
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
    divide_by_monomial_negacyclic_inplace<__uint128_t, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
        false);

@@ -146,20 +147,21 @@ __global__ void __launch_bounds__(params::degree / params::opt)
  }

  // Put "a" in [0, 2N[
  Torus a_hat = 0;
  modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
                 params::log2_degree + 1); // 2 * params::log2_degree + 1);
  InputTorus a_hat = 0;
  modulus_switch<InputTorus>(block_lwe_array_in[lwe_iteration], a_hat,
                             params::log2_degree +
                                 1); // 2 * params::log2_degree + 1);

  __syncthreads();

  // Perform ACC * (X^â - 1)
  multiply_by_monomial_negacyclic_and_sub_polynomial<
      Torus, params::opt, params::degree / params::opt>(global_slice,
                                                        accumulator, a_hat);
      __uint128_t, params::opt, params::degree / params::opt>(
      global_slice, accumulator, a_hat);

  // Perform a rounding to increase the accuracy of the
  // bootstrapped ciphertext
  init_decomposer_state_inplace<Torus, params::opt,
  init_decomposer_state_inplace<__uint128_t, params::opt,
                                params::degree / params::opt>(
      accumulator, base_log, level_count);
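Throughout these kernels, modulus_switch rescales a torus coefficient from 2^w (w = 64 for the InputTorus here) down to 2N with rounding; the + 1 in params::log2_degree + 1 is the factor two in 2N. A standalone model for 64-bit inputs (my own rendering; the backend's helper may differ in rounding details):

#include <cstdint>

// Rescale a 64-bit torus element onto [0, 2N], N = 2^log2_degree:
// keep the top (log_modulus) bits, rounding via the bit just below them.
inline void modulus_switch_model(uint64_t input, uint64_t &output,
                                 uint32_t log_modulus /* = log2(2N) */) {
  output = input >> (64 - log_modulus - 1); // keep one extra bit for rounding
  output = (output + 1) >> 1;               // round to nearest
}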
@@ -168,7 +170,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// Decompose the accumulator. Each block gets one level of the
|
||||
// decomposition, for the mask and the body (so block 0 will have the
|
||||
// accumulator decomposed at level 0, 1 at 1, etc.)
|
||||
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
|
||||
GadgetMatrix<__uint128_t, params> gadget_acc(base_log, level_count,
|
||||
accumulator);
|
||||
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);
|
||||
|
||||
// We are using the same memory space for accumulator_fft and
|
||||
@@ -314,10 +317,10 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
*
|
||||
* Each y-block computes one element of the lwe_array_out.
|
||||
*/
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
template <typename InputTorus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_programmable_bootstrap_cg_128(
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
__uint128_t *lwe_array_out, const __uint128_t *__restrict__ lut_vector,
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const double *__restrict__ bootstrapping_key, double *join_buffer,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, int8_t *device_mem,
|
||||
@@ -342,23 +345,22 @@ __global__ void device_programmable_bootstrap_cg_128(
|
||||
|
||||
// We always compute the pointer with most restrictive alignment to avoid
|
||||
// alignment issues
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
Torus *accumulator_rotated =
|
||||
(Torus *)accumulator + (ptrdiff_t)(polynomial_size);
|
||||
__uint128_t *accumulator = (__uint128_t *)selected_memory;
|
||||
__uint128_t *accumulator_rotated =
|
||||
(__uint128_t *)accumulator + (ptrdiff_t)(polynomial_size);
|
||||
double *accumulator_fft =
|
||||
(double *)(accumulator_rotated) +
|
||||
(ptrdiff_t)(polynomial_size * sizeof(Torus) / sizeof(double));
|
||||
(ptrdiff_t)(polynomial_size * sizeof(__uint128_t) / sizeof(double));
|
||||
|
||||
if constexpr (SMD == PARTIALSM)
|
||||
accumulator_fft = (double *)sharedmem;
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
const Torus *block_lwe_array_in =
|
||||
const InputTorus *block_lwe_array_in =
|
||||
&lwe_array_in[blockIdx.x * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
&lut_vector[blockIdx.x * params::degree * (glwe_dimension + 1)];
|
||||
const __uint128_t *block_lut_vector = lut_vector;
|
||||
|
||||
double *block_join_buffer =
|
||||
&join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
|
||||
@@ -368,11 +370,11 @@ __global__ void device_programmable_bootstrap_cg_128(
|
||||
// rotated array is not in use anymore by the time we perform the fft
|
||||
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
InputTorus b_hat = 0;
|
||||
modulus_switch<InputTorus>(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
divide_by_monomial_negacyclic_inplace<__uint128_t, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
@@ -381,17 +383,18 @@ __global__ void device_programmable_bootstrap_cg_128(
|
||||
__syncthreads();
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
Torus a_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);
|
||||
InputTorus a_hat = 0;
|
||||
modulus_switch<InputTorus>(block_lwe_array_in[i], a_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
// Perform ACC * (X^ä - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
__uint128_t, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);

// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
init_decomposer_state_inplace<Torus, params::opt,
init_decomposer_state_inplace<__uint128_t, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count);

@@ -400,8 +403,8 @@ __global__ void device_programmable_bootstrap_cg_128(
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
GadgetMatrix<__uint128_t, params> gadget_acc(base_log, level_count,
accumulator_rotated);
gadget_acc.decompose_and_compress_level_128(accumulator_fft, blockIdx.z);

auto acc_fft_re_hi = accumulator_fft + 0 * params::degree / 2;
@@ -420,8 +423,9 @@ __global__ void device_programmable_bootstrap_cg_128(
acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi, acc_fft_im_lo);
__syncthreads();

add_to_torus_128<Torus, params>(acc_fft_re_hi, acc_fft_re_lo, acc_fft_im_hi,
acc_fft_im_lo, accumulator);
add_to_torus_128<__uint128_t, params>(acc_fft_re_hi, acc_fft_re_lo,
acc_fft_im_hi, acc_fft_im_lo,
accumulator);
}

auto block_lwe_array_out =
@@ -433,17 +437,20 @@ __global__ void device_programmable_bootstrap_cg_128(
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
sample_extract_mask<__uint128_t, params>(block_lwe_array_out,
accumulator);

} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
sample_extract_body<__uint128_t, params>(block_lwe_array_out, accumulator,
0);
}
}
}

template <typename params>
template <typename InputTorus, typename params>
__host__ uint64_t scratch_programmable_bootstrap_cg_128(
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
@@ -457,33 +464,34 @@ __host__ uint64_t scratch_programmable_bootstrap_cg_128(
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg_128<__uint128_t, params, PARTIALSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg_128<__uint128_t, params, PARTIALSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg_128<__uint128_t, params, FULLSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg_128<__uint128_t, params, FULLSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}

uint64_t size_tracker = 0;
*buffer = new pbs_buffer_128<CLASSICAL>(
*buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::CG,
allocate_gpu_memory, allocate_ms_array, &size_tracker);
return size_tracker;
}

template <typename params>
template <typename InputTorus, typename params>
__host__ uint64_t scratch_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, pbs_buffer_128<CLASSICAL> **buffer,
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {
@@ -504,37 +512,37 @@ __host__ uint64_t scratch_programmable_bootstrap_128(
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, true>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, false>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params,
device_programmable_bootstrap_step_one_128<InputTorus, params,
PARTIALSM, false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
true>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
false>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
@@ -580,17 +588,122 @@ __host__ uint64_t scratch_programmable_bootstrap_128(
}

uint64_t size_tracker = 0;
*buffer = new pbs_buffer_128<CLASSICAL>(
*buffer = new pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL>(
stream, gpu_index, lwe_dimension, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT,
allocate_gpu_memory, allocate_ms_array, &size_tracker);
return size_tracker;
}

template <class params, bool first_iter>
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the PBS on 128-bit inputs, into `buffer`. It also configures SM options on
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
*/
template <typename InputTorus>
uint64_t scratch_cuda_programmable_bootstrap_128_vector(
void *stream, uint32_t gpu_index,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, bool allocate_ms_array) {

auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
auto buffer = (pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> **)pbs_buffer;

if (has_support_to_cuda_programmable_bootstrap_128_cg(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory)) {
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 512:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 1024:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 2048:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 4096:
return scratch_programmable_bootstrap_cg_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
default:
PANIC("Cuda error (classical PBS128): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
} else {
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 512:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 1024:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 2048:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
case 4096:
return scratch_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, allocate_ms_array);
break;
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"
" in the interval [256..4096].")
}
}
}
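// Illustrative call, not part of this diff (the `stream` handle is assumed
// to be a valid cudaStream_t cast to void*; the dimension values are the
// squashing parameters used by the tests elsewhere in this commit):
//
//   pbs_buffer_128<uint64_t, PBS_TYPE::CLASSICAL> *buffer = nullptr;
//   uint64_t size_on_gpu_0 =
//       scratch_cuda_programmable_bootstrap_128_vector<uint64_t>(
//           stream, /*gpu_index=*/0, &buffer, /*lwe_dimension=*/879,
//           /*glwe_dimension=*/2, /*polynomial_size=*/2048,
//           /*level_count=*/3, /*input_lwe_ciphertext_count=*/1,
//           /*allocate_gpu_memory=*/true, /*allocate_ms_array=*/true);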

template <typename InputTorus, class params, bool first_iter>
__host__ void execute_step_one_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
__uint128_t *lwe_array_in, double const *bootstrapping_key,
InputTorus *lwe_array_in, double const *bootstrapping_key,
__uint128_t *global_accumulator, double *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -603,21 +716,21 @@ __host__ void execute_step_one_128(
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);

if (max_shared_memory < partial_sm) {
device_programmable_bootstrap_step_one_128<__uint128_t, params, NOSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, NOSM,
first_iter>
<<<grid, thds, 0, stream>>>(
lut_vector, lwe_array_in, bootstrapping_key, global_accumulator,
global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size,
base_log, level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_one_128<__uint128_t, params, PARTIALSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, PARTIALSM,
first_iter>
<<<grid, thds, partial_sm, stream>>>(
lut_vector, lwe_array_in, bootstrapping_key, global_accumulator,
global_join_buffer, lwe_iteration, lwe_dimension, polynomial_size,
base_log, level_count, d_mem, partial_dm);
} else {
device_programmable_bootstrap_step_one_128<__uint128_t, params, FULLSM,
device_programmable_bootstrap_step_one_128<InputTorus, params, FULLSM,
first_iter>
<<<grid, thds, full_sm, stream>>>(
lut_vector, lwe_array_in, bootstrapping_key, global_accumulator,
@@ -670,11 +783,12 @@ __host__ void execute_step_two_128(
/*
* Host wrapper to the programmable bootstrap 128
*/
template <class params>
template <typename InputTorus, class params>
__host__ void host_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lut_vector, __uint128_t *lwe_array_in,
double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *pbs_buffer,
__uint128_t const *lut_vector, InputTorus *lwe_array_in,
double const *bootstrapping_key,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *pbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
@@ -704,14 +818,14 @@ __host__ void host_programmable_bootstrap_128(

for (int i = 0; i < lwe_dimension; i++) {
if (i == 0) {
execute_step_one_128<params, true>(
execute_step_one_128<InputTorus, params, true>(
stream, gpu_index, lut_vector, lwe_array_in, bootstrapping_key,
global_accumulator, global_join_buffer, input_lwe_ciphertext_count,
lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
d_mem, i, partial_sm, partial_dm_step_one, full_sm_step_one,
full_dm_step_one);
} else {
execute_step_one_128<params, false>(
execute_step_one_128<InputTorus, params, false>(
stream, gpu_index, lut_vector, lwe_array_in, bootstrapping_key,
global_accumulator, global_join_buffer, input_lwe_ciphertext_count,
lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
@@ -736,11 +850,12 @@ __host__ void host_programmable_bootstrap_128(
}
}

template <class params>
template <typename InputTorus, class params>
__host__ void host_programmable_bootstrap_cg_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
__uint128_t const *lut_vector, __uint128_t const *lwe_array_in,
double const *bootstrapping_key, pbs_buffer_128<CLASSICAL> *buffer,
__uint128_t const *lut_vector, InputTorus const *lwe_array_in,
double const *bootstrapping_key,
pbs_buffer_128<InputTorus, PBS_TYPE::CLASSICAL> *buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
@@ -783,20 +898,20 @@ __host__ void host_programmable_bootstrap_cg_128(
if (max_shared_memory < partial_sm) {
kernel_args[10] = &full_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_programmable_bootstrap_cg_128<__uint128_t, params, NOSM>,
(void *)device_programmable_bootstrap_cg_128<InputTorus, params, NOSM>,
grid, thds, (void **)kernel_args, 0, stream));
} else if (max_shared_memory < full_sm) {
kernel_args[10] = &partial_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_programmable_bootstrap_cg_128<__uint128_t, params,
PARTIALSM>,
(void *)
device_programmable_bootstrap_cg_128<InputTorus, params, PARTIALSM>,
grid, thds, (void **)kernel_args, partial_sm, stream));
} else {
int no_dm = 0;
kernel_args[10] = &no_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)
device_programmable_bootstrap_cg_128<__uint128_t, params, FULLSM>,
device_programmable_bootstrap_cg_128<InputTorus, params, FULLSM>,
grid, thds, (void **)kernel_args, full_sm, stream));
}

@@ -62,6 +62,12 @@ void multi_gpu_alloc_lwe_async(cudaStream_t const *streams,
}
}

template void multi_gpu_alloc_lwe_async<__uint128_t>(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, std::vector<__uint128_t *> &dest, uint32_t num_inputs,
uint32_t lwe_size, uint64_t *size_tracker_on_gpu_0,
bool allocate_gpu_memory);
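// (Explicit instantiation: the templated multi-GPU helpers are defined in
// this translation unit, so the __uint128_t variants must be emitted here
// for the 128-bit noise squashing path to link against them.)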

/// Allocates the input/output vector for all devices
/// Also initializes the related indexing, setting it to the trivial index
@@ -219,5 +225,9 @@ void multi_gpu_release_async(cudaStream_t const *streams,
for (uint i = 0; i < vec.size(); i++)
cuda_drop_async(vec[i], streams[i], gpu_indexes[i]);
}
template void
multi_gpu_release_async<__uint128_t>(cudaStream_t const *streams,
uint32_t const *gpu_indexes,
std::vector<__uint128_t *> &vec);

#endif

@@ -1316,6 +1316,14 @@ unsafe extern "C" {
mem_ptr_void: *mut *mut i8,
);
}
unsafe extern "C" {
pub fn extend_radix_with_trivial_zero_blocks_msb_64(
output: *mut CudaRadixCiphertextFFI,
input: *const CudaRadixCiphertextFFI,
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
);
}
unsafe extern "C" {
pub fn trim_radix_blocks_lsb_64(
output: *mut CudaRadixCiphertextFFI,
@@ -1325,11 +1333,49 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn extend_radix_with_trivial_zero_blocks_msb_64(
output: *mut CudaRadixCiphertextFFI,
input: *const CudaRadixCiphertextFFI,
pub fn scratch_cuda_apply_noise_squashing_kb(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr: *mut *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
input_glwe_dimension: u32,
input_polynomial_size: u32,
ks_level: u32,
ks_base_log: u32,
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_radix_blocks: u32,
num_original_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: PBS_TYPE,
allocate_gpu_memory: bool,
allocate_ms_array: bool,
) -> u64;
}
unsafe extern "C" {
pub fn cuda_apply_noise_squashing_kb(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
output_radix_lwe: *mut CudaRadixCiphertextFFI,
input_radix_lwe: *const CudaRadixCiphertextFFI,
mem_ptr: *mut i8,
ksks: *const *mut ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
bsks: *const *mut ffi::c_void,
);
}
unsafe extern "C" {
pub fn cleanup_cuda_apply_noise_squashing_kb(
streams: *const *mut ffi::c_void,
gpu_indexes: *const u32,
gpu_count: u32,
mem_ptr_void: *mut *mut i8,
);
}
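// The three bindings above follow the backend's scratch/apply/cleanup
// lifecycle. A minimal sketch of the intended call order (argument lists
// abridged; see `noise_squashing_async` later in this commit for the real
// call):
//
// let mut mem_ptr: *mut i8 = std::ptr::null_mut();
// unsafe {
//     scratch_cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count,
//         &mut mem_ptr, /* dimensions, moduli, PBS_TYPE, flags */);
//     cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count,
//         &mut output, &input, mem_ptr, ksks, ms_noise_reduction_key, bsks);
//     cleanup_cuda_apply_noise_squashing_kb(streams, gpu_indexes, gpu_count,
//         &mut mem_ptr);
// }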
unsafe extern "C" {
@@ -1927,7 +1973,7 @@ unsafe extern "C" {
lwe_array_in: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
ms_noise_reduction_ptr: *mut ffi::c_void,
ms_noise_reduction_ptr: *const ffi::c_void,
buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,

@@ -177,7 +177,10 @@ mod cuda {
};
use tfhe::core_crypto::prelude::*;
use tfhe::shortint::engine::ShortintEngine;
use tfhe::shortint::parameters::ModulusSwitchNoiseReductionParams;
use tfhe::shortint::parameters::{
NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
use tfhe::shortint::server_key::ModulusSwitchNoiseReductionKey;

fn cuda_pbs_128(c: &mut Criterion) {
@@ -188,26 +191,12 @@ mod cuda {
.measurement_time(std::time::Duration::from_secs(30));

type Scalar = u128;
let input_params = PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
let squash_params = NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

let lwe_dimension = LweDimension(879);
let glwe_dimension = GlweDimension(2);
let polynomial_size = PolynomialSize(2048);
let lwe_noise_distribution = DynamicDistribution::new_t_uniform(46);
let lwe_noise_distribution_u128: DynamicDistribution<u128> =
DynamicDistribution::new_t_uniform(46);
let glwe_noise_distribution = DynamicDistribution::new_t_uniform(30);
let pbs_base_log = DecompositionBaseLog(24);
let pbs_level = DecompositionLevelCount(3);
let ciphertext_modulus = CiphertextModulus::new_native();
let lwe_noise_distribution_u64 = DynamicDistribution::new_t_uniform(46);
let ct_modulus_u64: CiphertextModulus<u64> = CiphertextModulus::new_native();

let modulus_switch_noise_reduction_params = ModulusSwitchNoiseReductionParams {
modulus_switch_zeros_count: LweCiphertextCount(1449),
ms_bound: NoiseEstimationMeasureBound(288230376151711744f64),
ms_r_sigma_factor: RSigmaFactor(13.179852282053789f64),
ms_input_variance: Variance(2.63039184094559E-7f64),
};

let params_name = "PARAMS_SWITCH_SQUASH";

let mut boxed_seeder = new_seeder();
@@ -220,20 +209,11 @@ mod cuda {
EncryptionRandomGenerator::<DefaultRandomGenerator>::new(seeder.seed(), seeder);

let input_lwe_secret_key =
LweSecretKey::generate_new_binary(lwe_dimension, &mut secret_generator);

let input_lwe_secret_key_u128 = LweSecretKey::from_container(
input_lwe_secret_key
.as_ref()
.iter()
.copied()
.map(|x| x as u128)
.collect::<Vec<_>>(),
);
LweSecretKey::generate_new_binary(input_params.lwe_dimension, &mut secret_generator);

let output_glwe_secret_key = GlweSecretKey::<Vec<Scalar>>::generate_new_binary(
glwe_dimension,
polynomial_size,
squash_params.glwe_dimension,
squash_params.polynomial_size,
&mut secret_generator,
);

@@ -241,29 +221,32 @@ mod cuda {

let bsk = LweBootstrapKey::new(
Scalar::ZERO,
glwe_dimension.to_glwe_size(),
polynomial_size,
pbs_base_log,
pbs_level,
lwe_dimension,
ciphertext_modulus,
squash_params.glwe_dimension.to_glwe_size(),
squash_params.polynomial_size,
squash_params.decomp_base_log,
squash_params.decomp_level_count,
LweDimension(input_params.lwe_dimension.0),
squash_params.ciphertext_modulus,
);

let mut engine = ShortintEngine::new();

let modulus_switch_noise_reduction_key = Some(ModulusSwitchNoiseReductionKey::new(
modulus_switch_noise_reduction_params,
&input_lwe_secret_key,
&mut engine,
CiphertextModulus::new_native(),
lwe_noise_distribution,
));

let modulus_switch_noise_reduction_key = squash_params
.modulus_switch_noise_reduction_params
.map(|modulus_switch_noise_reduction_params| {
ModulusSwitchNoiseReductionKey::new(
modulus_switch_noise_reduction_params,
&input_lwe_secret_key,
&mut engine,
input_params.ciphertext_modulus,
input_params.lwe_noise_distribution,
)
});
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().bootstrap_key(bsk).build();

let message_modulus: Scalar = 1 << 4;
let input_message: Scalar = 3;
let delta: Scalar = (1 << (Scalar::BITS - 1)) / message_modulus;
let message_modulus: u64 = 1 << 4;
let input_message: u64 = 3;
let delta: u64 = (1 << (u64::BITS - 1)) / message_modulus;
let plaintext = Plaintext(input_message * delta);

let bench_id;
@@ -277,12 +260,12 @@ mod cuda {
&streams,
);

let lwe_ciphertext_in: LweCiphertextOwned<Scalar> =
let lwe_ciphertext_in: LweCiphertextOwned<u64> =
allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key_u128,
&input_lwe_secret_key,
plaintext,
lwe_noise_distribution_u128,
ciphertext_modulus,
lwe_noise_distribution_u64,
ct_modulus_u64,
&mut encryption_generator,
);
let lwe_ciphertext_in_gpu =
@@ -290,9 +273,9 @@ mod cuda {

let accumulator: GlweCiphertextOwned<Scalar> = GlweCiphertextOwned::new(
Scalar::ONE,
glwe_dimension.to_glwe_size(),
polynomial_size,
ciphertext_modulus,
squash_params.glwe_dimension.to_glwe_size(),
squash_params.polynomial_size,
squash_params.ciphertext_modulus,
);
let accumulator_gpu =
CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &streams);
@@ -300,7 +283,7 @@ mod cuda {
let out_pbs_ct = LweCiphertext::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
ciphertext_modulus,
squash_params.ciphertext_modulus,
);
let mut out_pbs_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&out_pbs_ct, &streams);
@@ -337,22 +320,22 @@ mod cuda {
let local_streams = cuda_local_streams_core();

let plaintext_list =
PlaintextList::new(Scalar::ZERO, PlaintextCount(elements_per_stream));
PlaintextList::new(u64::ZERO, PlaintextCount(elements_per_stream));

let input_cts = (0..gpu_count)
.map(|i| {
let mut input_ct_list = LweCiphertextList::new(
Scalar::ZERO,
u64::ZERO,
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
ciphertext_modulus,
ct_modulus_u64,
);

encrypt_lwe_ciphertext_list(
&input_lwe_secret_key_u128,
&input_lwe_secret_key,
&mut input_ct_list,
&plaintext_list,
lwe_noise_distribution_u128,
lwe_noise_distribution_u64,
&mut encryption_generator,
);

@@ -367,9 +350,9 @@ mod cuda {
.map(|i| {
let accumulator = GlweCiphertextOwned::new(
Scalar::ONE,
glwe_dimension.to_glwe_size(),
polynomial_size,
ciphertext_modulus,
squash_params.glwe_dimension.to_glwe_size(),
squash_params.polynomial_size,
squash_params.ciphertext_modulus,
);
CudaGlweCiphertextList::from_glwe_ciphertext(
&accumulator,
@@ -385,7 +368,7 @@ mod cuda {
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
ciphertext_modulus,
squash_params.ciphertext_modulus,
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&output_ct_list,
@@ -428,14 +411,14 @@ mod cuda {
};

let params_record = CryptoParametersRecord {
lwe_dimension: Some(lwe_dimension),
glwe_dimension: Some(glwe_dimension),
polynomial_size: Some(polynomial_size),
lwe_noise_distribution: Some(lwe_noise_distribution),
glwe_noise_distribution: Some(glwe_noise_distribution),
pbs_base_log: Some(pbs_base_log),
pbs_level: Some(pbs_level),
ciphertext_modulus: Some(ct_modulus_u64),
lwe_dimension: Some(input_params.lwe_dimension),
glwe_dimension: Some(squash_params.glwe_dimension),
polynomial_size: Some(squash_params.polynomial_size),
lwe_noise_distribution: Some(lwe_noise_distribution_u64),
glwe_noise_distribution: Some(input_params.glwe_noise_distribution),
pbs_base_log: Some(squash_params.decomp_base_log),
pbs_level: Some(squash_params.decomp_level_count),
ciphertext_modulus: Some(input_params.ciphertext_modulus),
..Default::default()
};

@@ -3,8 +3,6 @@ use super::misc::check_encrypted_content_respects_mod;
use crate::core_crypto::algorithms::misc::divide_round;
use crate::core_crypto::keycache::KeyCacheAccess;
use crate::core_crypto::prelude::*;
#[cfg(feature = "gpu")]
use crate::shortint::parameters::ModulusSwitchNoiseReductionParams;
pub(crate) use params::*;
use std::fmt::Debug;

@@ -431,25 +429,6 @@ pub const FFT_WOPBS_N2048_PARAMS: FftWopPbsTestParams<u64> = FftWopPbsTestParams
ciphertext_modulus: CiphertextModulus::new_native(),
};

#[cfg(feature = "gpu")]
pub const NOISESQUASHING128_U128_GPU_PARAMS: NoiseSquashingTestParams<u128> =
NoiseSquashingTestParams {
lwe_dimension: LweDimension(879),
glwe_dimension: GlweDimension(2),
polynomial_size: PolynomialSize(2048),
lwe_noise_distribution: DynamicDistribution::new_t_uniform(46),
glwe_noise_distribution: DynamicDistribution::new_t_uniform(30),
pbs_base_log: DecompositionBaseLog(24),
pbs_level: DecompositionLevelCount(3),
modulus_switch_noise_reduction_params: Some(ModulusSwitchNoiseReductionParams {
modulus_switch_zeros_count: LweCiphertextCount(1449),
ms_bound: NoiseEstimationMeasureBound(288230376151711744f64),
ms_r_sigma_factor: RSigmaFactor(13.179852282053789f64),
ms_input_variance: Variance(2.63039184094559E-7f64),
}),
ciphertext_modulus: CiphertextModulus::new_native(),
};

pub fn get_encoding_with_padding<Scalar: UnsignedInteger>(
ciphertext_modulus: CiphertextModulus<Scalar>,
) -> Scalar {

@@ -156,7 +156,7 @@ pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
/// be dropped until streams is synchronised
#[allow(clippy::too_many_arguments)]
pub unsafe fn cuda_programmable_bootstrap_128_lwe_ciphertext_async<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
input: &CudaLweCiphertextList<u64>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
num_samples: LweCiphertextCount,
@@ -219,15 +219,7 @@ pub unsafe fn cuda_programmable_bootstrap_128_lwe_ciphertext_async<Scalar>(
);

assert_eq!(
input.ciphertext_modulus(),
output.ciphertext_modulus(),
"Mismatched CiphertextModulus between input ({:?}) and output ({:?})",
input.ciphertext_modulus(),
output.ciphertext_modulus(),
);

assert_eq!(
input.ciphertext_modulus(),
accumulator.ciphertext_modulus(),
"Mismatched CiphertextModulus between input ({:?}) and accumulator ({:?})",
input.ciphertext_modulus(),
@@ -323,7 +315,7 @@ pub fn cuda_programmable_bootstrap_lwe_ciphertext<Scalar>(
/// index** in `output`.
#[allow(clippy::too_many_arguments)]
pub fn cuda_programmable_bootstrap_128_lwe_ciphertext<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
input: &CudaLweCiphertextList<u64>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
num_samples: LweCiphertextCount,

@@ -1,5 +1,8 @@
pub(crate) use crate::core_crypto::algorithms::test::gen_keys_or_get_from_cache_if_enabled;
use crate::shortint::parameters::DynamicDistribution;
use crate::shortint::prelude::{DecompositionBaseLog, LweDimension};

use crate::core_crypto::algorithms::par_allocate_and_generate_new_lwe_bootstrap_key;
use crate::core_crypto::algorithms::test::{FftBootstrapKeys, TestResources};
use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList;
use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey;
@@ -7,16 +10,32 @@ use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::vec::GpuIndex;
use crate::core_crypto::gpu::{cuda_programmable_bootstrap_128_lwe_ciphertext, CudaStreams};

use crate::core_crypto::keycache::KeyCacheAccess;
use crate::core_crypto::prelude::test::{
NoiseSquashingTestParams, NOISESQUASHING128_U128_GPU_PARAMS,
use crate::core_crypto::prelude::test::NoiseSquashingTestParams;
use crate::core_crypto::prelude::{
allocate_and_encrypt_new_lwe_ciphertext, decrypt_lwe_ciphertext,
generate_programmable_bootstrap_glwe_lut, CastFrom, CastInto, DecompositionLevelCount,
GlweCiphertextOwned, GlweSecretKey, LweCiphertextCount, LweCiphertextOwned, LweSecretKey,
Plaintext, SignedDecomposer, UnsignedTorus,
};
use crate::core_crypto::prelude::*;
use crate::shortint::engine::ShortintEngine;
use crate::shortint::parameters::{
NoiseSquashingParameters, NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
use crate::shortint::server_key::ModulusSwitchNoiseReductionKey;
use crate::shortint::ClassicPBSParameters;
use serde::de::DeserializeOwned;
use serde::Serialize;

pub fn generate_keys<
Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize> + Serialize + DeserializeOwned,
Scalar: UnsignedTorus
+ Sync
+ Send
+ CastFrom<usize>
+ CastFrom<u64>
+ CastInto<usize>
+ Serialize
+ DeserializeOwned,
>(
params: NoiseSquashingTestParams<Scalar>,
rsc: &mut TestResources,
@@ -52,62 +71,93 @@ pub fn generate_keys<
}
}

pub fn execute_bootstrap_u128<Scalar>(params: NoiseSquashingTestParams<Scalar>)
where
Scalar: Numeric
+ UnsignedTorus
+ CastFrom<usize>
+ CastInto<usize>
+ Send
+ Sync
+ Serialize
+ DeserializeOwned,
NoiseSquashingTestParams<Scalar>: KeyCacheAccess<Keys = FftBootstrapKeys<Scalar>>,
{
let lwe_noise_distribution = params.glwe_noise_distribution;
let glwe_dimension = params.glwe_dimension;
let polynomial_size = params.polynomial_size;
let ciphertext_modulus = params.ciphertext_modulus;
pub fn execute_bootstrap_u128(
squash_params: NoiseSquashingParameters,
input_params: ClassicPBSParameters,
) {
let glwe_dimension = squash_params.glwe_dimension;
let polynomial_size = squash_params.polynomial_size;
let ciphertext_modulus = squash_params.ciphertext_modulus;

let mut rsc = TestResources::new();

let mut keys_gen = |params| generate_keys(params, &mut rsc);
let keys = gen_keys_or_get_from_cache_if_enabled(params, &mut keys_gen);
let noise_squashing_test_params: NoiseSquashingTestParams<u128> = NoiseSquashingTestParams {
lwe_dimension: LweDimension(input_params.lwe_dimension.0),
glwe_dimension: squash_params.glwe_dimension,
polynomial_size: squash_params.polynomial_size,
lwe_noise_distribution: DynamicDistribution::new_t_uniform(46),
glwe_noise_distribution: squash_params.glwe_noise_distribution,
pbs_base_log: squash_params.decomp_base_log,
pbs_level: squash_params.decomp_level_count,
modulus_switch_noise_reduction_params: squash_params.modulus_switch_noise_reduction_params,
ciphertext_modulus: squash_params.ciphertext_modulus,
};

let mut keys_gen = |_params| generate_keys(noise_squashing_test_params, &mut rsc);
let keys = gen_keys_or_get_from_cache_if_enabled(noise_squashing_test_params, &mut keys_gen);
let (std_bootstrapping_key, small_lwe_sk, big_lwe_sk) =
(keys.bsk, keys.small_lwe_sk, keys.big_lwe_sk);
let output_lwe_dimension = big_lwe_sk.lwe_dimension();

let input_lwe_secret_key = LweSecretKey::from_container(
small_lwe_sk
.into_container()
.iter()
.copied()
.map(|x| x as u64)
.collect::<Vec<_>>(),
);

let mut engine = ShortintEngine::new();

let modulus_switch_noise_reduction_key = squash_params
.modulus_switch_noise_reduction_params
.map(|modulus_switch_noise_reduction_params| {
ModulusSwitchNoiseReductionKey::new(
modulus_switch_noise_reduction_params,
&input_lwe_secret_key,
&mut engine,
input_params.ciphertext_modulus,
input_params.lwe_noise_distribution,
)
});
let gpu_index = 0;
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&std_bootstrapping_key, None, &stream);
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(
&std_bootstrapping_key,
modulus_switch_noise_reduction_key.as_ref(),
&stream,
);

// Our 4 bits message space
let message_modulus: Scalar = Scalar::ONE << 4;

let message_modulus: u64 = 1 << 4;
// Our input message
let input_message: Scalar = 3usize.cast_into();
let input_message: u64 = 3usize.cast_into();

// Delta used to encode 4 bits of message + a bit of padding on Scalar
let delta: Scalar = (Scalar::ONE << (Scalar::BITS - 1)) / message_modulus;

let delta: u64 = (1 << (u64::BITS - 1)) / message_modulus;
let delta_u128: u128 = (1 << (u128::BITS - 1)) / message_modulus as u128;

// Apply our encoding
let plaintext = Plaintext(input_message * delta);

// Allocate a new LweCiphertext and encrypt our plaintext
let lwe_ciphertext_in: LweCiphertextOwned<Scalar> = allocate_and_encrypt_new_lwe_ciphertext(
&small_lwe_sk,
let lwe_ciphertext_in: LweCiphertextOwned<u64> = allocate_and_encrypt_new_lwe_ciphertext(
&input_lwe_secret_key,
plaintext,
lwe_noise_distribution,
ciphertext_modulus,
input_params.lwe_noise_distribution,
input_params.ciphertext_modulus,
&mut rsc.encryption_random_generator,
);

let f = |x: Scalar| x;
let accumulator: GlweCiphertextOwned<Scalar> = generate_programmable_bootstrap_glwe_lut(
let f = |x: u128| x;
let accumulator: GlweCiphertextOwned<u128> = generate_programmable_bootstrap_glwe_lut(
polynomial_size,
glwe_dimension.to_glwe_size(),
message_modulus.cast_into(),
ciphertext_modulus,
delta,
delta_u128,
f,
);

@@ -136,8 +186,7 @@ where
let pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream);

// Decrypt the PBS result
let pbs_plaintext: Plaintext<Scalar> = decrypt_lwe_ciphertext(&big_lwe_sk, &pbs_ct);

let pbs_plaintext: Plaintext<u128> = decrypt_lwe_ciphertext(&big_lwe_sk, &pbs_ct);
// Create a SignedDecomposer to perform the rounding of the decrypted plaintext
// We pass a DecompositionBaseLog of 5 and a DecompositionLevelCount of 1 indicating we want
// to round the 5 MSB, 1 bit of padding plus our 4 bits of message
@@ -145,12 +194,15 @@ where
SignedDecomposer::new(DecompositionBaseLog(5), DecompositionLevelCount(1));

// Round and remove our encoding
let pbs_result: Scalar = signed_decomposer.closest_representable(pbs_plaintext.0) / delta;
let pbs_result: u128 = signed_decomposer.closest_representable(pbs_plaintext.0) / delta_u128;

assert_eq!(f(input_message), pbs_result);
assert_eq!(f(input_message as u128), pbs_result);
}

#[test]
fn test_bootstrap_u128_with_squashing() {
execute_bootstrap_u128::<u128>(NOISESQUASHING128_U128_GPU_PARAMS);
execute_bootstrap_u128(
NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
);
}

@@ -211,7 +211,7 @@ pub unsafe fn programmable_bootstrap_128_async<T: UnsignedInteger>(
streams: &CudaStreams,
lwe_array_out: &mut CudaVec<T>,
test_vector: &CudaVec<T>,
lwe_array_in: &CudaVec<T>,
lwe_array_in: &CudaVec<u64>,
bootstrapping_key: &CudaVec<f64>,
lwe_dimension: LweDimension,
glwe_dimension: GlweDimension,

@@ -4,11 +4,14 @@ use crate::backward_compatibility::booleans::{
};
use crate::high_level_api::details::MaybeCloned;
use crate::high_level_api::errors::UninitializedNoiseSquashing;
use crate::high_level_api::global_state;
use crate::high_level_api::global_state::with_internal_keys;
use crate::high_level_api::global_state::{self, with_internal_keys};
use crate::high_level_api::keys::InternalServerKey;
use crate::high_level_api::traits::{FheDecrypt, SquashNoise};
use crate::integer::ciphertext::SquashedNoiseBooleanBlock;
#[cfg(feature = "gpu")]
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
#[cfg(feature = "gpu")]
use crate::integer::gpu::ciphertext::CudaIntegerRadixCiphertext;
use crate::named::Named;
use crate::{ClientKey, Device, Tag};
use serde::{Deserializer, Serializer};
@@ -177,9 +180,40 @@ impl SquashNoise for FheBool {
})
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => Err(crate::error!(
"Cuda devices do not support noise squashing yet"
)),
InternalServerKey::Cuda(cuda_key) => {
let streams = &cuda_key.streams;

let noise_squashing_key = cuda_key
.key
.noise_squashing_key
.as_ref()
.ok_or(UninitializedNoiseSquashing)?;

let cuda_block = CudaBooleanBlock(match self.ciphertext.on_gpu(streams) {
MaybeCloned::Borrowed(gpu_ct) => {
unsafe {
// SAFETY
// The gpu_ct is a ref, meaning it belongs to the thing
// that is being iterated on, so it will stay alive for the
// whole function
gpu_ct.duplicate_async(streams)
}
}
MaybeCloned::Cloned(gpu_ct) => gpu_ct,
});
let cuda_squashed_block = noise_squashing_key.squash_boolean_block_noise(
cuda_key.pbs_key(),
&cuda_block,
streams,
)?;
let cpu_squashed_block =
cuda_squashed_block.to_squashed_noise_boolean_block(streams);

Ok(SquashedNoiseFheBool {
inner: InnerSquashedNoiseBoolean::Cpu(cpu_squashed_block),
tag: cuda_key.tag.clone(),
})
}
#[cfg(feature = "hpu")]
InternalServerKey::Hpu(_device) => {
Err(crate::error!("Hpu devices do not support noise squashing"))

@@ -4,8 +4,7 @@ use crate::backward_compatibility::integers::{
};
use crate::high_level_api::details::MaybeCloned;
use crate::high_level_api::errors::UninitializedNoiseSquashing;
use crate::high_level_api::global_state;
use crate::high_level_api::global_state::with_internal_keys;
use crate::high_level_api::global_state::{self, with_internal_keys};
use crate::high_level_api::keys::InternalServerKey;
use crate::high_level_api::traits::{FheDecrypt, SquashNoise};
use crate::integer::block_decomposition::{RecomposableFrom, SignExtendable};
@@ -196,9 +195,27 @@ impl<Id: FheIntId> SquashNoise for FheInt<Id> {
})
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => Err(crate::error!(
"Cuda devices do not support noise squashing yet"
)),
InternalServerKey::Cuda(cuda_key) => {
let streams = &cuda_key.streams;
let noise_squashing_key = cuda_key
.key
.noise_squashing_key
.as_ref()
.ok_or(UninitializedNoiseSquashing)?;

let cuda_squashed_ct = noise_squashing_key.squash_signed_radix_ciphertext_noise(
cuda_key.pbs_key(),
&self.ciphertext.on_gpu(streams),
streams,
)?;

let cpu_squashed_ct =
cuda_squashed_ct.to_squashed_noise_signed_radix_ciphertext(streams);
Ok(SquashedNoiseFheInt {
inner: InnerSquashedNoiseSignedRadixCiphertext::Cpu(cpu_squashed_ct),
tag: cuda_key.tag.clone(),
})
}
#[cfg(feature = "hpu")]
InternalServerKey::Hpu(_device) => {
Err(crate::error!("Hpu devices do not support noise squashing"))

@@ -5,8 +5,7 @@ use crate::backward_compatibility::integers::{
use crate::core_crypto::commons::numeric::UnsignedNumeric;
use crate::high_level_api::details::MaybeCloned;
use crate::high_level_api::errors::UninitializedNoiseSquashing;
use crate::high_level_api::global_state;
use crate::high_level_api::global_state::with_internal_keys;
use crate::high_level_api::global_state::{self, with_internal_keys};
use crate::high_level_api::keys::InternalServerKey;
use crate::high_level_api::traits::{FheDecrypt, SquashNoise};
use crate::integer::block_decomposition::RecomposableFrom;
@@ -191,9 +190,26 @@ impl<Id: FheUintId> SquashNoise for FheUint<Id> {
})
}
#[cfg(feature = "gpu")]
InternalServerKey::Cuda(_) => Err(crate::error!(
"Cuda devices do not support noise squashing yet"
)),
InternalServerKey::Cuda(cuda_key) => {
let streams = &cuda_key.streams;

let noise_squashing_key = cuda_key
.key
.noise_squashing_key
.as_ref()
.ok_or(UninitializedNoiseSquashing)?;

let cuda_squashed_ct = noise_squashing_key.squash_radix_ciphertext_noise(
cuda_key.pbs_key(),
&self.ciphertext.on_gpu(streams).ciphertext,
streams,
)?;
let squashed_ct = cuda_squashed_ct.to_squashed_noise_radix_ciphertext(streams);
Ok(SquashedNoiseFheUint {
inner: InnerSquashedNoiseRadixCiphertext::Cpu(squashed_ct),
tag: cuda_key.tag.clone(),
})
}
#[cfg(feature = "hpu")]
InternalServerKey::Hpu(_device) => {
Err(crate::error!("Hpu devices do not support noise squashing"))

@@ -322,6 +322,8 @@ pub struct IntegerCudaServerKey {
Option<crate::integer::gpu::list_compression::server_keys::CudaCompressionKey>,
pub(crate) decompression_key:
Option<crate::integer::gpu::list_compression::server_keys::CudaDecompressionKey>,
pub(crate) noise_squashing_key:
Option<crate::integer::gpu::noise_squashing::keys::CudaNoiseSquashingKey>,
}

#[derive(Clone, serde::Serialize, serde::Deserialize, Versionize)]

@@ -341,6 +341,13 @@ impl CompressedServerKey {
}
None => None,
};
let noise_squashing_key: Option<
crate::integer::gpu::noise_squashing::keys::CudaNoiseSquashingKey,
> = self
.integer_key
.noise_squashing_key
.as_ref()
.map(|noise_squashing_key| noise_squashing_key.decompress_to_cuda(&streams));
synchronize_devices(streams.len() as u32);
CudaServerKey {
key: Arc::new(IntegerCudaServerKey {
@@ -348,6 +355,7 @@ impl CompressedServerKey {
cpk_key_switching_key_material,
compression_key,
decompression_key,
noise_squashing_key,
}),
tag: self.tag.clone(),
streams,

@@ -8,6 +8,7 @@ use crate::shortint::parameters::{
NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};

use rand::prelude::*;

#[test]
@@ -29,6 +30,74 @@ fn test_noise_squashing() {

let squashed = bitand.squash_noise().unwrap();

let recovered: U256 = squashed.decrypt(&cks);

assert_eq!(clear, recovered);

// Native unsigned
let clear: u32 = rng.gen();
let enc = FheUint32::encrypt(clear, &cks);
let bitand = &enc & &enc;

let squashed = bitand.squash_noise().unwrap();

let recovered: u32 = squashed.decrypt(&cks);

assert_eq!(clear, recovered);

// Non native signed with proper input range
let clear: i16 = rng.gen_range(-1 << 9..1 << 9);
let enc = FheInt10::encrypt(clear, &cks);
let bitand = &enc & &enc;

let squashed = bitand.squash_noise().unwrap();

let recovered: i16 = squashed.decrypt(&cks);
assert_eq!(clear, recovered);

// Native signed
let clear: i8 = rng.gen();
let enc = FheInt8::encrypt(clear, &cks);
let bitand = &enc & &enc;

let squashed = bitand.squash_noise().unwrap();

let recovered: i8 = squashed.decrypt(&cks);
assert_eq!(clear, recovered);

// Booleans
for clear in [false, true] {
let enc = FheBool::encrypt(clear, &cks);
let bitand = &enc & &enc;

let squashed = bitand.squash_noise().unwrap();

let recovered: bool = squashed.decrypt(&cks);
assert_eq!(clear, recovered);
}
}
#[cfg(feature = "gpu")]
#[test]
fn test_gpu_noise_squashing() {
let config = ConfigBuilder::with_custom_parameters(
crate::shortint::parameters::PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_noise_squashing(NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
.build();
let cks = crate::ClientKey::generate(config);
let sks = cks.generate_compressed_server_key();

set_server_key(sks.decompress_to_gpu());

let mut rng = thread_rng();

// Non native type for clear
let clear: U256 = rng.gen();
let enc = FheUint256::encrypt(clear, &cks);
let bitand = &enc & &enc;

let squashed = bitand.squash_noise().unwrap();

let recovered: U256 = squashed.decrypt(&cks);
assert_eq!(clear, recovered);

@@ -2,6 +2,7 @@ pub mod boolean_value;
pub mod compact_list;
pub mod compressed_ciphertext_list;
pub mod info;
pub mod squashed_noise;

use crate::core_crypto::gpu::lwe_bootstrap_key::{
prepare_cuda_ms_noise_reduction_key_ffi, CudaModulusSwitchNoiseReductionKey,

124
tfhe/src/integer/gpu/ciphertext/squashed_noise.rs
Normal file
@@ -0,0 +1,124 @@
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::CudaStreams;
use crate::core_crypto::prelude::{LweCiphertextCount, LweCiphertextOwned, LweSize};
use crate::integer::ciphertext::{
SquashedNoiseBooleanBlock, SquashedNoiseRadixCiphertext, SquashedNoiseSignedRadixCiphertext,
};
use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo};
use crate::shortint::ciphertext::{Degree, NoiseLevel, SquashedNoiseCiphertext};
use crate::shortint::parameters::CoreCiphertextModulus;
use crate::shortint::{AtomicPatternKind, CarryModulus, MessageModulus, PBSOrder};

pub struct CudaSquashedNoiseRadixCiphertext {
pub packed_d_blocks: CudaLweCiphertextList<u128>,
pub info: CudaRadixCiphertextInfo,
pub original_block_count: usize,
}

pub struct CudaSquashedNoiseUnsignedRadixCiphertext {
pub ciphertext: CudaSquashedNoiseRadixCiphertext,
}

pub struct CudaSquashedNoiseSignedRadixCiphertext {
pub ciphertext: CudaSquashedNoiseRadixCiphertext,
}

pub struct CudaSquashedNoiseBooleanBlock {
pub ciphertext: CudaSquashedNoiseRadixCiphertext,
}

impl CudaSquashedNoiseRadixCiphertext {
pub(crate) fn new_zero(
lwe_size: LweSize,
lwe_ciphertext_count: LweCiphertextCount,
ciphertext_modulus: CoreCiphertextModulus<u128>,
message_modulus: MessageModulus,
carry_modulus: CarryModulus,
original_block_count: usize,
streams: &CudaStreams,
) -> Self {
let mut blocks_info = Vec::with_capacity(lwe_ciphertext_count.0);

for _ in 0..lwe_ciphertext_count.0 {
blocks_info.push(CudaBlockInfo {
degree: Degree::new(0u64),
message_modulus,
carry_modulus,
atomic_pattern: AtomicPatternKind::Standard(PBSOrder::KeyswitchBootstrap),
noise_level: NoiseLevel::ZERO,
});
}
Self {
packed_d_blocks: CudaLweCiphertextList::<u128>::new(
lwe_size.to_lwe_dimension(),
lwe_ciphertext_count,
ciphertext_modulus,
streams,
),
info: CudaRadixCiphertextInfo {
blocks: blocks_info,
},
original_block_count,
}
}

pub(crate) fn to_squashed_noise_radix_ciphertext(
&self,
streams: &CudaStreams,
) -> SquashedNoiseRadixCiphertext {
let num_blocks = self.packed_d_blocks.lwe_ciphertext_count().0;
let lwe_size = self.packed_d_blocks.lwe_dimension().to_lwe_size();
let ct_modulus = self.packed_d_blocks.ciphertext_modulus();

let lwe_ct_list_cpu = self.packed_d_blocks.to_lwe_ciphertext_list(streams);
let mut packed_blocks = Vec::<SquashedNoiseCiphertext>::with_capacity(num_blocks);
lwe_ct_list_cpu
.as_ref()
.chunks(lwe_size.0)
.enumerate()
.for_each(|(i, block)| {
let block = LweCiphertextOwned::from_container(block.to_vec(), ct_modulus);
let info = self.info.blocks[i];
packed_blocks.push(SquashedNoiseCiphertext::new(
block,
info.degree,
info.message_modulus,
info.carry_modulus,
));
});
SquashedNoiseRadixCiphertext {
packed_blocks,
original_block_count: self.original_block_count,
}
}
}

impl CudaSquashedNoiseSignedRadixCiphertext {
pub fn to_squashed_noise_signed_radix_ciphertext(
&self,
streams: &CudaStreams,
) -> SquashedNoiseSignedRadixCiphertext {
SquashedNoiseSignedRadixCiphertext {
packed_blocks: self
.ciphertext
.to_squashed_noise_radix_ciphertext(streams)
.packed_blocks,
original_block_count: self.ciphertext.original_block_count,
}
}
}

impl CudaSquashedNoiseBooleanBlock {
pub fn to_squashed_noise_boolean_block(
&self,
streams: &CudaStreams,
) -> SquashedNoiseBooleanBlock {
SquashedNoiseBooleanBlock {
ciphertext: self
.ciphertext
.to_squashed_noise_radix_ciphertext(streams)
.packed_blocks[0]
.clone(),
}
}
}
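// Typical conversion back to the CPU types from `crate::integer::ciphertext`
// (sketch; `cuda_squashed` is a hypothetical value produced by the CUDA noise
// squashing key, as in the hlapi glue code earlier in this commit):
//
// let cpu_radix: SquashedNoiseRadixCiphertext =
//     cuda_squashed.to_squashed_noise_radix_ciphertext(&streams);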
|
||||
@@ -2,6 +2,7 @@ pub mod ciphertext;
|
||||
pub mod client_key;
|
||||
pub mod key_switching_key;
|
||||
pub mod list_compression;
|
||||
pub mod noise_squashing;
|
||||
pub mod server_key;
|
||||
#[cfg(feature = "zk-pok")]
|
||||
pub mod zk;
|
||||
@@ -5800,3 +5801,126 @@ pub unsafe fn extend_radix_with_trivial_zero_blocks_msb_async(

    update_noise_degree(output, &cuda_ffi_output);
}

#[allow(clippy::too_many_arguments)]
/// # Safety
///
/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization
///   is required
pub unsafe fn noise_squashing_async<T: UnsignedInteger, B: Numeric>(
    streams: &CudaStreams,
    output: &mut CudaSliceMut<T>,
    output_degrees: &mut Vec<u64>,
    output_noise_levels: &mut Vec<u64>,
    input: &CudaSlice<u64>,
    bootstrapping_key: &CudaVec<f64>,
    keyswitch_key: &CudaVec<u64>,
    lwe_dimension: LweDimension,
    glwe_dimension: GlweDimension,
    polynomial_size: PolynomialSize,
    input_glwe_dimension: GlweDimension,
    input_polynomial_size: PolynomialSize,
    ks_level: DecompositionLevelCount,
    ks_base_log: DecompositionBaseLog,
    pbs_level: DecompositionLevelCount,
    pbs_base_log: DecompositionBaseLog,
    num_blocks: u32,
    original_num_blocks: u32,
    message_modulus: MessageModulus,
    carry_modulus: CarryModulus,
    pbs_type: PBSType,
    grouping_factor: LweBskGroupingFactor,
    noise_reduction_key: Option<&CudaModulusSwitchNoiseReductionKey>,
    ct_modulus: f64,
) {
    assert_eq!(
        streams.gpu_indexes[0],
        input.gpu_index(0),
        "GPU error: first stream is on GPU {}, first input pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        input.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        output.gpu_index(0),
        "GPU error: first stream is on GPU {}, first output pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        output.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        bootstrapping_key.gpu_index(0),
        "GPU error: first stream is on GPU {}, first bsk pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        bootstrapping_key.gpu_index(0).get(),
    );
    assert_eq!(
        streams.gpu_indexes[0],
        keyswitch_key.gpu_index(0),
        "GPU error: first stream is on GPU {}, first ksk pointer is on GPU {}",
        streams.gpu_indexes[0].get(),
        keyswitch_key.gpu_index(0).get(),
    );
    let ms_noise_reduction_key_ffi =
        prepare_cuda_ms_noise_reduction_key_ffi(noise_reduction_key, ct_modulus);

    let allocate_ms_noise_array = noise_reduction_key.is_some();
    let mut mem_ptr: *mut i8 = std::ptr::null_mut();
    let mut cuda_ffi_output = prepare_cuda_radix_ffi_from_slice_mut(
        output,
        output_degrees,
        output_noise_levels,
        num_blocks,
        (glwe_dimension.0 * polynomial_size.0) as u32,
    );
    let cuda_ffi_input = prepare_cuda_radix_ffi_from_slice(
        input,
        output_degrees,
        output_noise_levels,
        original_num_blocks,
        (input_glwe_dimension.0 * input_polynomial_size.0) as u32,
    );

    scratch_cuda_apply_noise_squashing_kb(
        streams.ptr.as_ptr(),
        streams.gpu_indexes_ptr(),
        streams.len() as u32,
        std::ptr::addr_of_mut!(mem_ptr),
        lwe_dimension.0 as u32,
        glwe_dimension.0 as u32,
        polynomial_size.0 as u32,
        input_glwe_dimension.0 as u32,
        input_polynomial_size.0 as u32,
        ks_level.0 as u32,
        ks_base_log.0 as u32,
        pbs_level.0 as u32,
        pbs_base_log.0 as u32,
        grouping_factor.0 as u32,
        num_blocks,
        original_num_blocks,
        message_modulus.0 as u32,
        carry_modulus.0 as u32,
        pbs_type as u32,
        true,
        allocate_ms_noise_array,
    );

    cuda_apply_noise_squashing_kb(
        streams.ptr.as_ptr(),
        streams.gpu_indexes_ptr(),
        streams.len() as u32,
        &raw mut cuda_ffi_output,
        &raw const cuda_ffi_input,
        mem_ptr,
        keyswitch_key.ptr.as_ptr(),
        &raw const ms_noise_reduction_key_ffi,
        bootstrapping_key.ptr.as_ptr(),
    );

    cleanup_cuda_apply_noise_squashing_kb(
        streams.ptr.as_ptr(),
        streams.gpu_indexes_ptr(),
        streams.len() as u32,
        std::ptr::addr_of_mut!(mem_ptr),
    );
}
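
As the safety note above requires, the caller must synchronize the streams once results are needed. A sketch of that lifecycle contract only, with the long argument list deliberately elided (`apply_noise_squashing_async` further down in this diff is the real call site):

// Sketch of the calling contract only; arguments elided for brevity.
unsafe {
    noise_squashing_async::<u128, f64>(&streams, /* output slices, keys, parameters... */);
}
// Mandatory before reading the outputs or dropping any input buffer.
streams.synchronize();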

tfhe/src/integer/gpu/noise_squashing/keys.rs (new file)
@@ -0,0 +1,140 @@
use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey;
use crate::core_crypto::gpu::CudaStreams;

use crate::core_crypto::prelude::LweCiphertextCount;
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo};
use crate::integer::gpu::ciphertext::squashed_noise::{
    CudaSquashedNoiseBooleanBlock, CudaSquashedNoiseRadixCiphertext,
    CudaSquashedNoiseSignedRadixCiphertext,
};
use crate::integer::gpu::ciphertext::{CudaRadixCiphertext, CudaSignedRadixCiphertext};
use crate::integer::gpu::CudaServerKey;
use crate::shortint::parameters::CoreCiphertextModulus;

use crate::shortint::{CarryModulus, MessageModulus};

pub struct CudaNoiseSquashingKey {
    pub bootstrapping_key: CudaLweBootstrapKey,
    pub message_modulus: MessageModulus,
    pub carry_modulus: CarryModulus,
    pub output_ciphertext_modulus: CoreCiphertextModulus<u128>,
}

impl CudaNoiseSquashingKey {
    pub fn checked_squash_ciphertext_noise(
        &self,
        ciphertext: &CudaRadixCiphertext,
        src_server_key: &CudaServerKey,
        streams: &CudaStreams,
    ) -> crate::Result<CudaSquashedNoiseRadixCiphertext> {
        for block in ciphertext.info.blocks.iter() {
            if src_server_key
                .max_noise_level
                .validate(block.noise_level)
                .is_err()
            {
                return Err(crate::error!(
"squash_ciphertext_noise requires the input Ciphertext to have at most {:?} noise \
|
||||
got {:?}.",
|
||||
src_server_key.max_noise_level,
|
||||
block.noise_level
                ));
            }
            if block.message_modulus != self.message_modulus {
                return Err(crate::error!(
                    "Mismatched MessageModulus between Ciphertext {:?} and NoiseSquashingKey {:?}.",
                    block.message_modulus,
                    self.message_modulus,
                ));
            }
            if block.carry_modulus != self.carry_modulus {
                return Err(crate::error!(
                    "Mismatched CarryModulus between Ciphertext {:?} and NoiseSquashingKey {:?}.",
                    block.carry_modulus,
                    self.carry_modulus,
                ));
            }
        }

        Ok(self.unchecked_squash_ciphertext_noise(ciphertext, src_server_key, streams))
    }

    pub fn unchecked_squash_ciphertext_noise(
        &self,
        ciphertext: &CudaRadixCiphertext,
        src_server_key: &CudaServerKey,
        streams: &CudaStreams,
    ) -> CudaSquashedNoiseRadixCiphertext {
        let original_block_count = ciphertext.d_blocks.lwe_ciphertext_count().0;
        let packed_size = ciphertext.d_blocks.lwe_ciphertext_count().0.div_ceil(2);
        let mut squashed_output = CudaSquashedNoiseRadixCiphertext::new_zero(
            self.bootstrapping_key.output_lwe_dimension().to_lwe_size(),
            LweCiphertextCount(packed_size),
            self.output_ciphertext_modulus,
            self.message_modulus,
            self.carry_modulus,
            original_block_count,
            streams,
        );

        src_server_key.apply_noise_squashing(&mut squashed_output, ciphertext, self, streams);

        let mut new_block_info = Vec::<CudaBlockInfo>::with_capacity(packed_size);
        for block in squashed_output.info.blocks.iter() {
            new_block_info.push(CudaBlockInfo {
                degree: block.degree,
                message_modulus: block.message_modulus,
                carry_modulus: block.carry_modulus,
                atomic_pattern: block.atomic_pattern,
                noise_level: block.noise_level,
            });
        }
        CudaSquashedNoiseRadixCiphertext {
            packed_d_blocks: squashed_output.packed_d_blocks,
            info: CudaRadixCiphertextInfo {
                blocks: new_block_info,
            },
            original_block_count,
        }
    }

    pub fn squash_radix_ciphertext_noise(
        &self,
        src_server_key: &CudaServerKey,
        ciphertext: &CudaRadixCiphertext,
        streams: &CudaStreams,
    ) -> crate::Result<CudaSquashedNoiseRadixCiphertext> {
        self.checked_squash_ciphertext_noise(ciphertext, src_server_key, streams)
    }

    pub fn squash_signed_radix_ciphertext_noise(
        &self,
        src_server_key: &CudaServerKey,
        ciphertext: &CudaSignedRadixCiphertext,
        streams: &CudaStreams,
    ) -> crate::Result<CudaSquashedNoiseSignedRadixCiphertext> {
        let squashed_output =
            self.checked_squash_ciphertext_noise(&ciphertext.ciphertext, src_server_key, streams)?;
        Ok(CudaSquashedNoiseSignedRadixCiphertext {
            ciphertext: squashed_output,
        })
    }

    pub fn squash_boolean_block_noise(
        &self,
        src_server_key: &CudaServerKey,
        ciphertext: &CudaBooleanBlock,
        streams: &CudaStreams,
    ) -> crate::Result<CudaSquashedNoiseBooleanBlock> {
        let squashed_output = self.checked_squash_ciphertext_noise(
            &ciphertext.as_ref().ciphertext,
            src_server_key,
            streams,
        )?;
        Ok(CudaSquashedNoiseBooleanBlock {
            ciphertext: squashed_output,
        })
    }
}
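
A minimal usage sketch of the entry points above (hedged: `cuda_nsk`, `cuda_sks`, `d_ct`, and `streams` are assumed bindings, and the `?` line runs inside a function returning `crate::Result`; only the method names come from this diff):

// Sketch only: all bindings here are assumed.
// Checked path: validates noise level and moduli before squashing.
let squashed = cuda_nsk.squash_radix_ciphertext_noise(&cuda_sks, &d_ct, &streams)?;

// Unchecked path: skips the validation; the caller upholds the invariants.
// Note the argument order differs from the checked wrapper above.
let squashed_unchecked =
    cuda_nsk.unchecked_squash_ciphertext_noise(&d_ct, &cuda_sks, &streams);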

tfhe/src/integer/gpu/noise_squashing/mod.rs (new file)
@@ -0,0 +1,2 @@
pub mod keys;
pub mod noise_squashing_keys;

tfhe/src/integer/gpu/noise_squashing/noise_squashing_keys.rs (new file)
@@ -0,0 +1,32 @@
use super::keys::CudaNoiseSquashingKey;
use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey;
use crate::core_crypto::gpu::CudaStreams;
use crate::integer::noise_squashing::CompressedNoiseSquashingKey;

impl CompressedNoiseSquashingKey {
    pub fn decompress_to_cuda(&self, streams: &CudaStreams) -> CudaNoiseSquashingKey {
        let std_bsk = self
            .key
            .bootstrapping_key()
            .as_view()
            .par_decompress_into_lwe_bootstrap_key();

        let ms_noise_reduction_key = self
            .key
            .modulus_switch_noise_reduction_key()
            .as_ref()
            .map(|key| key.decompress());
        let bootstrapping_key = CudaLweBootstrapKey::from_lwe_bootstrap_key(
            &std_bsk,
            ms_noise_reduction_key.as_ref(),
            streams,
        );

        CudaNoiseSquashingKey {
            bootstrapping_key,
            message_modulus: self.key.message_modulus(),
            carry_modulus: self.key.carry_modulus(),
            output_ciphertext_modulus: self.key.output_ciphertext_modulus(),
        }
    }
}
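
End to end, the decompression above slots in as follows; a minimal sketch (hedged: `cks: ClientKey` and `streams` are assumed bindings, and the constructor's parameter argument, whose signature is truncated later in this diff, is deliberately elided rather than invented):

// Sketch only: `cks` and `streams` are assumed bindings.
let compressed_key = cks.new_compressed_noise_squashing_key(/* noise squashing parameters */);
let cuda_nsk = compressed_key.decompress_to_cuda(&streams);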

@@ -8,17 +8,19 @@ use crate::core_crypto::prelude::{
use crate::integer::block_decomposition::{BlockDecomposer, DecomposableInto};
use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo};
use crate::integer::gpu::ciphertext::squashed_noise::CudaSquashedNoiseRadixCiphertext;
use crate::integer::gpu::ciphertext::{
    CudaIntegerRadixCiphertext, CudaRadixCiphertext, CudaSignedRadixCiphertext,
    CudaUnsignedRadixCiphertext,
};
use crate::integer::gpu::noise_squashing::keys::CudaNoiseSquashingKey;
use crate::integer::gpu::server_key::CudaBootstrappingKey;
use crate::integer::gpu::{
    add_and_propagate_single_carry_assign_async, apply_bivariate_lut_kb_async,
    apply_many_univariate_lut_kb_async, apply_univariate_lut_kb_async,
    compute_prefix_sum_hillis_steele_async, extend_radix_with_trivial_zero_blocks_msb_async,
    full_propagate_assign_async, propagate_single_carry_assign_async, trim_radix_blocks_lsb_async,
    CudaServerKey, PBSType,
    full_propagate_assign_async, noise_squashing_async, propagate_single_carry_assign_async,
    trim_radix_blocks_lsb_async, CudaServerKey, PBSType,
};
use crate::integer::server_key::radix_parallel::OutputFlag;
use crate::shortint::ciphertext::{Degree, NoiseLevel};
@@ -1766,4 +1768,86 @@ impl CudaServerKey {
            * size_of::<u64>()
            * ct.as_ref().d_blocks.lwe_dimension().0) as u64
    }

    pub(crate) fn apply_noise_squashing(
        &self,
        output: &mut CudaSquashedNoiseRadixCiphertext,
        input: &CudaRadixCiphertext,
        squashing_key: &CudaNoiseSquashingKey,
        streams: &CudaStreams,
    ) {
        unsafe {
            self.apply_noise_squashing_async(output, input, squashing_key, streams);
        }
        streams.synchronize();
    }

    /// Applies noise squashing to the blocks of `input`, packing them pairwise into the 128-bit
    /// blocks of `output`
    ///
    /// # Safety
    ///
    /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must
    ///   not be dropped until streams is synchronized
    pub(crate) unsafe fn apply_noise_squashing_async(
        &self,
        output: &mut CudaSquashedNoiseRadixCiphertext,
        input: &CudaRadixCiphertext,
        squashing_key: &CudaNoiseSquashingKey,
        streams: &CudaStreams,
    ) {
        let num_output_blocks = output.packed_d_blocks.lwe_ciphertext_count().0;

        let mut output_degrees = vec![0_u64; num_output_blocks];
        let mut output_noise_levels = vec![0_u64; num_output_blocks];
        let input_slice = input.d_blocks.0.d_vec.as_slice(.., 0).unwrap();
        let mut output_slice = output.packed_d_blocks.0.d_vec.as_mut_slice(.., 0).unwrap();
        let ct_modulus = input.d_blocks.ciphertext_modulus().raw_modulus_float();
        let d_bsk = &squashing_key.bootstrapping_key;
        let (input_glwe_dimension, input_polynomial_size) = match &self.bootstrapping_key {
            CudaBootstrappingKey::Classic(d_bsk) => {
                (d_bsk.glwe_dimension(), d_bsk.polynomial_size())
            }
            CudaBootstrappingKey::MultiBit(d_multibit_bsk) => (
                d_multibit_bsk.glwe_dimension(),
                d_multibit_bsk.polynomial_size(),
            ),
        };
        unsafe {
            noise_squashing_async::<u128, f64>(
                streams,
                &mut output_slice,
                &mut output_degrees,
                &mut output_noise_levels,
                &input_slice,
                &d_bsk.d_vec,
                &self.key_switching_key.d_vec,
                self.key_switching_key
                    .output_key_lwe_size()
                    .to_lwe_dimension(),
                d_bsk.glwe_dimension,
                d_bsk.polynomial_size,
                input_glwe_dimension,
                input_polynomial_size,
                self.key_switching_key.decomposition_level_count(),
                self.key_switching_key.decomposition_base_log(),
                d_bsk.decomp_level_count,
                d_bsk.decomp_base_log,
                num_output_blocks as u32,
                input.d_blocks.lwe_ciphertext_count().0 as u32,
                self.message_modulus,
                self.carry_modulus,
                PBSType::Classical,
                LweBskGroupingFactor(0),
                d_bsk.d_ms_noise_reduction_key.as_ref(),
                ct_modulus,
            );
        }

        for (i, info) in output.info.blocks.iter_mut().enumerate() {
            info.degree = Degree(output_degrees[i]);
            info.noise_level = NoiseLevel(output_noise_levels[i]);
        }
    }
}

@@ -17,6 +17,20 @@ pub struct SquashedNoiseCiphertext {
}

impl SquashedNoiseCiphertext {
    #[allow(dead_code)]
    pub(crate) fn new(
        ct: LweCiphertextOwned<u128>,
        degree: Degree,
        message_modulus: MessageModulus,
        carry_modulus: CarryModulus,
    ) -> Self {
        Self {
            ct,
            degree,
            message_modulus,
            carry_modulus,
        }
    }

    pub(crate) fn new_zero(
        lwe_size: LweSize,
        ciphertext_modulus: CoreCiphertextModulus<u128>,

@@ -25,6 +25,17 @@ pub struct CompressedNoiseSquashingKey {
    output_ciphertext_modulus: CoreCiphertextModulus<u128>,
}

impl CompressedNoiseSquashingKey {
    pub fn bootstrapping_key(&self) -> &SeededLweBootstrapKeyOwned<u128> {
        &self.bootstrapping_key
    }

    pub fn modulus_switch_noise_reduction_key(
        &self,
    ) -> Option<&CompressedModulusSwitchNoiseReductionKey<u64>> {
        self.modulus_switch_noise_reduction_key.as_ref()
    }
}
impl ClientKey {
    pub fn new_compressed_noise_squashing_key(
        &self,