mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-09 14:47:56 -05:00
chore(gpu): add a benchmark for 128-bit multi-bit noise squashing
- Also, remove the lut indexes concept from the 128-bit multi-bit pbs. It's assumed not to exist by the entire backend (as it doesn't for classical PBS). So to keep it here would be a bit error prone.
This commit is contained in:
@@ -1066,8 +1066,18 @@ template <typename InputTorus> struct int_noise_squashing_lut {
|
||||
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
|
||||
tmp_lwe_before_ks, gpu_memory_allocated);
|
||||
for (int i = 0; i < pbs_buffer.size(); i++) {
|
||||
cleanup_cuda_programmable_bootstrap_128(streams[i], gpu_indexes[i],
|
||||
&pbs_buffer[i]);
|
||||
switch (params.pbs_type) {
|
||||
case MULTI_BIT:
|
||||
cleanup_cuda_multi_bit_programmable_bootstrap_128(
|
||||
streams[i], gpu_indexes[i], &pbs_buffer[i]);
|
||||
break;
|
||||
case CLASSICAL:
|
||||
cleanup_cuda_programmable_bootstrap_128(streams[i], gpu_indexes[i],
|
||||
&pbs_buffer[i]);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (PBS): unknown PBS type. ")
|
||||
}
|
||||
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
|
||||
}
|
||||
if (lwe_aligned_gather_vec.size() > 0) {
|
||||
|
||||
@@ -47,12 +47,11 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
void const *lwe_input_indexes, void const *bootstrapping_key,
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride);
|
||||
void const *lwe_array_in, void const *lwe_input_indexes,
|
||||
void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride);
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
|
||||
const uint32_t gpu_index,
|
||||
|
||||
@@ -347,18 +347,12 @@ void execute_pbs_async(
|
||||
auto current_lwe_input_indexes =
|
||||
get_variant_element(lwe_input_indexes, i);
|
||||
|
||||
int gpu_offset =
|
||||
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
|
||||
auto d_lut_vector_indexes =
|
||||
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
|
||||
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
streams[i], gpu_indexes[i], current_lwe_array_out,
|
||||
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
|
||||
current_lwe_array_in, current_lwe_input_indexes,
|
||||
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_inputs_on_gpu, num_many_lut, lut_stride);
|
||||
current_lwe_output_indexes, lut_vec[i], current_lwe_array_in,
|
||||
current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i],
|
||||
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
|
||||
base_log, level_count, num_inputs_on_gpu, num_many_lut, lut_stride);
|
||||
}
|
||||
break;
|
||||
case CLASSICAL:
|
||||
|
||||
@@ -120,8 +120,8 @@ template <typename InputTorus>
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
|
||||
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
|
||||
InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
|
||||
__uint128_t const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
@@ -131,45 +131,45 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
case 256:
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 512:
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 1024:
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 2048:
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 4096:
|
||||
host_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -182,8 +182,8 @@ template <typename InputTorus>
|
||||
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
|
||||
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
|
||||
InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
|
||||
__uint128_t const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
@@ -194,46 +194,46 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 512:
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 1024:
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 2048:
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 4096:
|
||||
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
|
||||
polynomial_size, grouping_factor, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
@@ -245,12 +245,11 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
void const *lwe_input_indexes, void const *bootstrapping_key,
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
void const *lwe_array_in, void const *lwe_input_indexes,
|
||||
void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (multi-bit PBS): base log should be <= 64")
|
||||
@@ -263,7 +262,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
uint64_t>(stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const __uint128_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const __uint128_t *>(bootstrapping_key), buffer,
|
||||
@@ -276,7 +274,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
|
||||
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const __uint128_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint64_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const __uint128_t *>(bootstrapping_key), buffer,
|
||||
|
||||
@@ -136,7 +136,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const InputTorus *__restrict__ lwe_input_indexes,
|
||||
const __uint128_t *__restrict__ lut_vector,
|
||||
const InputTorus *__restrict__ lut_vector_indexes,
|
||||
__uint128_t *global_accumulator, double *global_accumulator_fft,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
@@ -169,8 +168,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
auto block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
auto block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.x] *
|
||||
params::degree * (glwe_dimension + 1)];
|
||||
auto block_lut_vector = lut_vector;
|
||||
|
||||
auto global_slice =
|
||||
&global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) *
|
||||
@@ -368,7 +366,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
__uint128_t *lwe_array_out,
|
||||
const InputTorus *__restrict__ lwe_output_indexes,
|
||||
const __uint128_t *__restrict__ lut_vector,
|
||||
const InputTorus *__restrict__ lut_vector_indexes,
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const InputTorus *__restrict__ lwe_input_indexes,
|
||||
const double *__restrict__ keybundle_array, double *join_buffer,
|
||||
@@ -409,8 +406,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
auto block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
auto block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.x] *
|
||||
params::degree * (glwe_dimension + 1)];
|
||||
auto block_lut_vector = lut_vector;
|
||||
|
||||
auto block_join_buffer =
|
||||
&join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
|
||||
@@ -591,8 +587,7 @@ __host__ void execute_compute_keybundle_128(
|
||||
template <typename InputTorus, class params, bool is_first_iter>
|
||||
__host__ void execute_step_one_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
|
||||
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
InputTorus const *lwe_input_indexes,
|
||||
InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
|
||||
pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t num_samples,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count) {
|
||||
@@ -618,27 +613,26 @@ __host__ void execute_step_one_128(
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one_128<
|
||||
InputTorus, params, NOSM, is_first_iter>
|
||||
<<<grid_accumulate_step_one, thds, 0, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes,
|
||||
global_accumulator, global_accumulator_fft, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, d_mem,
|
||||
lwe_array_in, lwe_input_indexes, lut_vector, global_accumulator,
|
||||
global_accumulator_fft, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, d_mem,
|
||||
full_sm_accumulate_step_one);
|
||||
else if (max_shared_memory < full_sm_accumulate_step_one)
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one_128<
|
||||
InputTorus, params, PARTIALSM, is_first_iter>
|
||||
<<<grid_accumulate_step_one, thds, partial_sm_accumulate_step_one,
|
||||
stream>>>(lwe_array_in, lwe_input_indexes, lut_vector,
|
||||
lut_vector_indexes, global_accumulator,
|
||||
global_accumulator_fft, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, d_mem,
|
||||
partial_sm_accumulate_step_one);
|
||||
global_accumulator, global_accumulator_fft, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count,
|
||||
d_mem, partial_sm_accumulate_step_one);
|
||||
else
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one_128<
|
||||
InputTorus, params, FULLSM, is_first_iter>
|
||||
<<<grid_accumulate_step_one, thds, full_sm_accumulate_step_one,
|
||||
stream>>>(lwe_array_in, lwe_input_indexes, lut_vector,
|
||||
lut_vector_indexes, global_accumulator,
|
||||
global_accumulator_fft, lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, d_mem, 0);
|
||||
global_accumulator, global_accumulator_fft, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count,
|
||||
d_mem, 0);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -691,8 +685,8 @@ template <typename InputTorus, class params>
|
||||
__host__ void host_multi_bit_programmable_bootstrap_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
|
||||
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
|
||||
InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
|
||||
__uint128_t const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
@@ -717,14 +711,14 @@ __host__ void host_multi_bit_programmable_bootstrap_128(
|
||||
(j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
|
||||
if (is_first_iter) {
|
||||
execute_step_one_128<InputTorus, params, true>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, buffer, num_samples, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count);
|
||||
stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
base_log, level_count);
|
||||
} else {
|
||||
execute_step_one_128<InputTorus, params, false>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, buffer, num_samples, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count);
|
||||
stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
base_log, level_count);
|
||||
}
|
||||
|
||||
if (is_last_iter) {
|
||||
@@ -745,9 +739,8 @@ __host__ void host_multi_bit_programmable_bootstrap_128(
|
||||
template <typename InputTorus, class params>
|
||||
__host__ void execute_cg_external_product_loop_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
|
||||
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
InputTorus const *lwe_input_indexes, __uint128_t *lwe_array_out,
|
||||
InputTorus const *lwe_output_indexes,
|
||||
InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
|
||||
__uint128_t *lwe_array_out, InputTorus const *lwe_output_indexes,
|
||||
pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t num_samples,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
|
||||
@@ -780,46 +773,45 @@ __host__ void execute_cg_external_product_loop_128(
|
||||
auto global_accumulator = buffer->global_accumulator;
|
||||
auto join_buffer = buffer->global_join_buffer;
|
||||
|
||||
void *kernel_args[22];
|
||||
void *kernel_args[21];
|
||||
kernel_args[0] = &lwe_array_out;
|
||||
kernel_args[1] = &lwe_output_indexes;
|
||||
kernel_args[2] = &lut_vector;
|
||||
kernel_args[3] = &lut_vector_indexes;
|
||||
kernel_args[4] = &lwe_array_in;
|
||||
kernel_args[5] = &lwe_input_indexes;
|
||||
kernel_args[6] = &keybundle_fft;
|
||||
kernel_args[7] = &join_buffer;
|
||||
kernel_args[8] = &global_accumulator;
|
||||
kernel_args[9] = &lwe_dimension;
|
||||
kernel_args[10] = &glwe_dimension;
|
||||
kernel_args[11] = &polynomial_size;
|
||||
kernel_args[12] = &base_log;
|
||||
kernel_args[13] = &level_count;
|
||||
kernel_args[14] = &grouping_factor;
|
||||
kernel_args[15] = &lwe_offset;
|
||||
kernel_args[16] = &chunk_size;
|
||||
kernel_args[17] = &keybundle_size_per_input;
|
||||
kernel_args[18] = &d_mem;
|
||||
kernel_args[20] = &num_many_lut;
|
||||
kernel_args[21] = &lut_stride;
|
||||
kernel_args[3] = &lwe_array_in;
|
||||
kernel_args[4] = &lwe_input_indexes;
|
||||
kernel_args[5] = &keybundle_fft;
|
||||
kernel_args[6] = &join_buffer;
|
||||
kernel_args[7] = &global_accumulator;
|
||||
kernel_args[8] = &lwe_dimension;
|
||||
kernel_args[9] = &glwe_dimension;
|
||||
kernel_args[10] = &polynomial_size;
|
||||
kernel_args[11] = &base_log;
|
||||
kernel_args[12] = &level_count;
|
||||
kernel_args[13] = &grouping_factor;
|
||||
kernel_args[14] = &lwe_offset;
|
||||
kernel_args[15] = &chunk_size;
|
||||
kernel_args[16] = &keybundle_size_per_input;
|
||||
kernel_args[17] = &d_mem;
|
||||
kernel_args[19] = &num_many_lut;
|
||||
kernel_args[20] = &lut_stride;
|
||||
|
||||
dim3 grid_accumulate(num_samples, glwe_dimension + 1, level_count);
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
|
||||
if (max_shared_memory < partial_dm) {
|
||||
kernel_args[19] = &full_dm;
|
||||
kernel_args[18] = &full_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
|
||||
InputTorus, params, NOSM>,
|
||||
grid_accumulate, thds, (void **)kernel_args, 0, stream));
|
||||
} else if (max_shared_memory < full_dm) {
|
||||
kernel_args[19] = &partial_dm;
|
||||
kernel_args[18] = &partial_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
|
||||
InputTorus, params, PARTIALSM>,
|
||||
grid_accumulate, thds, (void **)kernel_args, partial_sm, stream));
|
||||
} else {
|
||||
kernel_args[19] = &no_dm;
|
||||
kernel_args[18] = &no_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
|
||||
InputTorus, params, FULLSM>,
|
||||
@@ -831,8 +823,8 @@ template <typename InputTorus, class params>
|
||||
__host__ void host_cg_multi_bit_programmable_bootstrap_128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
|
||||
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
|
||||
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
|
||||
InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
|
||||
__uint128_t const *bootstrapping_key,
|
||||
pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
@@ -851,11 +843,10 @@ __host__ void host_cg_multi_bit_programmable_bootstrap_128(
|
||||
|
||||
// Accumulate
|
||||
execute_cg_external_product_loop_128<InputTorus, params>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
|
||||
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
|
||||
lut_stride);
|
||||
stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
|
||||
lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
lwe_offset, num_many_lut, lut_stride);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2698,7 +2698,6 @@ unsafe extern "C" {
|
||||
lwe_array_out: *mut ffi::c_void,
|
||||
lwe_output_indexes: *const ffi::c_void,
|
||||
lut_vector: *const ffi::c_void,
|
||||
lut_vector_indexes: *const ffi::c_void,
|
||||
lwe_array_in: *const ffi::c_void,
|
||||
lwe_input_indexes: *const ffi::c_void,
|
||||
bootstrapping_key: *const ffi::c_void,
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
|
||||
use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
use benchmark::params_aliases::{
|
||||
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
BENCH_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
};
|
||||
#[cfg(not(feature = "gpu"))]
|
||||
@@ -25,8 +29,12 @@ use tfhe::prelude::*;
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
use tfhe::core_crypto::gpu::get_number_of_gpus;
|
||||
use tfhe::shortint::parameters::{
|
||||
CompressionParameters, NoiseSquashingCompressionParameters, NoiseSquashingParameters,
|
||||
};
|
||||
use tfhe::shortint::PBSParameters;
|
||||
#[cfg(feature = "gpu")]
|
||||
use tfhe::{set_server_key, GpuIndex};
|
||||
use tfhe::GpuIndex;
|
||||
use tfhe::{
|
||||
ClientKey, CompressedCiphertextListBuilder, CompressedServerKey,
|
||||
CompressedSquashedNoiseCiphertextListBuilder, FheUint10, FheUint12, FheUint128, FheUint14,
|
||||
@@ -36,18 +44,42 @@ use tfhe::{
|
||||
|
||||
fn bench_sns_only_fhe_type<FheType>(
|
||||
c: &mut Criterion,
|
||||
client_key: &ClientKey,
|
||||
params: (
|
||||
PBSParameters,
|
||||
NoiseSquashingParameters,
|
||||
NoiseSquashingCompressionParameters,
|
||||
CompressionParameters,
|
||||
),
|
||||
type_name: &str,
|
||||
num_bits: usize,
|
||||
) where
|
||||
FheType: FheEncrypt<u128, ClientKey> + Send + Sync,
|
||||
FheType: SquashNoise,
|
||||
{
|
||||
let (param, noise_param, _, _) = params;
|
||||
|
||||
use tfhe::{set_server_key, ConfigBuilder};
|
||||
let config = ConfigBuilder::with_custom_parameters(param)
|
||||
.enable_noise_squashing(noise_param)
|
||||
.build();
|
||||
let client_key = ClientKey::generate(config);
|
||||
let compressed_sks = CompressedServerKey::new(&client_key);
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
set_server_key(compressed_sks.decompress_to_gpu());
|
||||
|
||||
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
|
||||
{
|
||||
let decompressed_sks = compressed_sks.decompress();
|
||||
rayon::broadcast(|_| set_server_key(decompressed_sks.clone()));
|
||||
set_server_key(decompressed_sks);
|
||||
}
|
||||
|
||||
let mut bench_group = c.benchmark_group(type_name);
|
||||
let bench_id_prefix = if cfg!(feature = "gpu") {
|
||||
"hlapi::cuda"
|
||||
format!("hlapi::cuda::{}", noise_param.name())
|
||||
} else {
|
||||
"hlapi"
|
||||
"hlapi".to_string()
|
||||
};
|
||||
let bench_id_suffix = format!("noise_squash::{type_name}");
|
||||
|
||||
@@ -60,9 +92,9 @@ fn bench_sns_only_fhe_type<FheType>(
|
||||
bench_id = format!("{bench_id_prefix}::{bench_id_suffix}");
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
configure_gpu(client_key);
|
||||
configure_gpu(&client_key);
|
||||
|
||||
let input = FheType::encrypt(rng.gen(), client_key);
|
||||
let input = FheType::encrypt(rng.gen(), &client_key);
|
||||
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
b.iter(|| {
|
||||
@@ -82,7 +114,7 @@ fn bench_sns_only_fhe_type<FheType>(
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
println!("elements: {elements}");
|
||||
let gpu_count = get_number_of_gpus() as usize;
|
||||
let compressed_server_key = CompressedServerKey::new(client_key);
|
||||
let compressed_server_key = CompressedServerKey::new(&client_key);
|
||||
let sks_vec = (0..gpu_count)
|
||||
.map(|i| {
|
||||
compressed_server_key.decompress_to_specific_gpu(GpuIndex::new(i as u32))
|
||||
@@ -92,7 +124,7 @@ fn bench_sns_only_fhe_type<FheType>(
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let encrypt_values = || {
|
||||
(0..elements)
|
||||
.map(|_| FheType::encrypt(rng.gen(), client_key))
|
||||
.map(|_| FheType::encrypt(rng.gen(), &client_key))
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -118,7 +150,7 @@ fn bench_sns_only_fhe_type<FheType>(
|
||||
bench_group.bench_function(&bench_id, |b| {
|
||||
let encrypt_values = || {
|
||||
(0..elements)
|
||||
.map(|_| FheType::encrypt(rng.gen(), client_key))
|
||||
.map(|_| FheType::encrypt(rng.gen(), &client_key))
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
@@ -150,7 +182,12 @@ fn bench_sns_only_fhe_type<FheType>(
|
||||
|
||||
fn bench_decomp_sns_comp_fhe_type<FheType>(
|
||||
c: &mut Criterion,
|
||||
client_key: &ClientKey,
|
||||
params: (
|
||||
PBSParameters,
|
||||
NoiseSquashingParameters,
|
||||
NoiseSquashingCompressionParameters,
|
||||
CompressionParameters,
|
||||
),
|
||||
type_name: &str,
|
||||
num_bits: usize,
|
||||
) where
|
||||
@@ -158,11 +195,32 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
|
||||
FheType: SquashNoise + Tagged + HlExpandable + HlCompressible,
|
||||
<FheType as SquashNoise>::Output: HlSquashedNoiseCompressible,
|
||||
{
|
||||
let (param, noise_param, comp_noise_param, comp_param) = params;
|
||||
|
||||
use tfhe::{set_server_key, ConfigBuilder};
|
||||
let config = ConfigBuilder::with_custom_parameters(param)
|
||||
.enable_noise_squashing(noise_param)
|
||||
.enable_noise_squashing_compression(comp_noise_param)
|
||||
.enable_compression(comp_param)
|
||||
.build();
|
||||
let client_key = ClientKey::generate(config);
|
||||
let compressed_sks = CompressedServerKey::new(&client_key);
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
set_server_key(compressed_sks.decompress_to_gpu());
|
||||
|
||||
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
|
||||
{
|
||||
let decompressed_sks = compressed_sks.decompress();
|
||||
rayon::broadcast(|_| set_server_key(decompressed_sks.clone()));
|
||||
set_server_key(decompressed_sks);
|
||||
}
|
||||
|
||||
let mut bench_group = c.benchmark_group(type_name);
|
||||
let bench_id_prefix = if cfg!(feature = "gpu") {
|
||||
"hlapi::cuda"
|
||||
format!("hlapi::cuda::{}", noise_param.name())
|
||||
} else {
|
||||
"hlapi"
|
||||
"hlapi".to_string()
|
||||
};
|
||||
let bench_id_suffix = format!("decomp_noise_squash_comp::{type_name}");
|
||||
|
||||
@@ -175,9 +233,9 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
|
||||
bench_id = format!("{bench_id_prefix}::{bench_id_suffix}");
|
||||
|
||||
#[cfg(feature = "gpu")]
|
||||
configure_gpu(client_key);
|
||||
configure_gpu(&client_key);
|
||||
|
||||
let input = FheType::encrypt(rng.gen(), client_key);
|
||||
let input = FheType::encrypt(rng.gen(), &client_key);
|
||||
|
||||
let mut builder = CompressedCiphertextListBuilder::new();
|
||||
builder.push(input);
|
||||
@@ -205,7 +263,7 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
|
||||
bench_group.throughput(Throughput::Elements(elements));
|
||||
println!("elements: {elements}");
|
||||
let gpu_count = get_number_of_gpus() as usize;
|
||||
let compressed_server_key = CompressedServerKey::new(client_key);
|
||||
let compressed_server_key = CompressedServerKey::new(&client_key);
|
||||
let sks_vec = (0..gpu_count)
|
||||
.map(|i| {
|
||||
compressed_server_key.decompress_to_specific_gpu(GpuIndex::new(i as u32))
|
||||
@@ -216,7 +274,7 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
|
||||
let compressed_values = || {
|
||||
(0..elements)
|
||||
.map(|_| {
|
||||
let input = FheType::encrypt(rng.gen(), client_key);
|
||||
let input = FheType::encrypt(rng.gen(), &client_key);
|
||||
let mut builder = CompressedCiphertextListBuilder::new();
|
||||
builder.push(input);
|
||||
builder.build().unwrap()
|
||||
@@ -254,7 +312,7 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
|
||||
let compressed_values = || {
|
||||
(0..elements)
|
||||
.map(|_| {
|
||||
let input = FheType::encrypt(rng.gen(), client_key);
|
||||
let input = FheType::encrypt(rng.gen(), &client_key);
|
||||
let mut builder = CompressedCiphertextListBuilder::new();
|
||||
builder.push(input);
|
||||
builder.build().unwrap()
|
||||
@@ -296,8 +354,10 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
|
||||
macro_rules! bench_sns_only_type {
|
||||
($fhe_type:ident) => {
|
||||
::paste::paste! {
|
||||
fn [<bench_sns_only_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
|
||||
bench_sns_only_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
|
||||
fn [<bench_sns_only_ $fhe_type:snake>](c: &mut Criterion, params: &[(PBSParameters, NoiseSquashingParameters, NoiseSquashingCompressionParameters, CompressionParameters)]) {
|
||||
for param in params {
|
||||
bench_sns_only_fhe_type::<$fhe_type>(c, *param, stringify!($fhe_type), $fhe_type::num_bits());
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -306,8 +366,10 @@ macro_rules! bench_sns_only_type {
|
||||
macro_rules! bench_decomp_sns_comp_type {
|
||||
($fhe_type:ident) => {
|
||||
::paste::paste! {
|
||||
fn [<bench_decomp_sns_comp_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
|
||||
bench_decomp_sns_comp_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
|
||||
fn [<bench_decomp_sns_comp_ $fhe_type:snake>](c: &mut Criterion, params: &[(PBSParameters, NoiseSquashingParameters, NoiseSquashingCompressionParameters, CompressionParameters)]) {
|
||||
for param in params {
|
||||
bench_decomp_sns_comp_fhe_type::<$fhe_type>(c, *param, stringify!($fhe_type), $fhe_type::num_bits());
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -330,65 +392,55 @@ bench_decomp_sns_comp_type!(FheUint64);
|
||||
fn main() {
|
||||
#[cfg(feature = "hpu")]
|
||||
panic!("Noise squashing is not supported on HPU");
|
||||
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
|
||||
let cks = {
|
||||
use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||
use tfhe::{set_server_key, ConfigBuilder};
|
||||
let config = ConfigBuilder::with_custom_parameters(
|
||||
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
)
|
||||
.enable_noise_squashing(BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
|
||||
.enable_noise_squashing_compression(
|
||||
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
)
|
||||
.enable_compression(BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
|
||||
.build();
|
||||
let cks = ClientKey::generate(config);
|
||||
let compressed_sks = CompressedServerKey::new(&cks);
|
||||
|
||||
let decompressed_sks = compressed_sks.decompress();
|
||||
rayon::broadcast(|_| set_server_key(decompressed_sks.clone()));
|
||||
set_server_key(decompressed_sks);
|
||||
cks
|
||||
};
|
||||
#[cfg(feature = "gpu")]
|
||||
let cks = {
|
||||
use tfhe::{set_server_key, ConfigBuilder};
|
||||
let config = ConfigBuilder::with_custom_parameters(
|
||||
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
)
|
||||
.enable_noise_squashing(
|
||||
BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
)
|
||||
.enable_noise_squashing_compression(
|
||||
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
)
|
||||
.enable_compression(
|
||||
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
)
|
||||
.build();
|
||||
let cks = ClientKey::generate(config);
|
||||
let compressed_sks = CompressedServerKey::new(&cks);
|
||||
let params: Vec<(
|
||||
PBSParameters,
|
||||
NoiseSquashingParameters,
|
||||
NoiseSquashingCompressionParameters,
|
||||
CompressionParameters,
|
||||
)> = {
|
||||
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
|
||||
{
|
||||
vec![(
|
||||
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
|
||||
BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
)]
|
||||
}
|
||||
|
||||
set_server_key(compressed_sks.decompress_to_gpu());
|
||||
cks
|
||||
#[cfg(feature = "gpu")]
|
||||
{
|
||||
vec![(
|
||||
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
|
||||
BENCH_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
), (
|
||||
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
|
||||
BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
),
|
||||
]
|
||||
}
|
||||
};
|
||||
|
||||
let mut c = Criterion::default().configure_from_args();
|
||||
|
||||
bench_sns_only_fhe_uint2(&mut c, &cks);
|
||||
bench_sns_only_fhe_uint4(&mut c, &cks);
|
||||
bench_sns_only_fhe_uint6(&mut c, &cks);
|
||||
bench_sns_only_fhe_uint8(&mut c, &cks);
|
||||
bench_sns_only_fhe_uint10(&mut c, &cks);
|
||||
bench_sns_only_fhe_uint12(&mut c, &cks);
|
||||
bench_sns_only_fhe_uint14(&mut c, &cks);
|
||||
bench_sns_only_fhe_uint16(&mut c, &cks);
|
||||
bench_sns_only_fhe_uint32(&mut c, &cks);
|
||||
bench_sns_only_fhe_uint64(&mut c, &cks);
|
||||
bench_sns_only_fhe_uint128(&mut c, &cks);
|
||||
bench_sns_only_fhe_uint2(&mut c, params.as_slice());
|
||||
bench_sns_only_fhe_uint4(&mut c, params.as_slice());
|
||||
bench_sns_only_fhe_uint6(&mut c, params.as_slice());
|
||||
bench_sns_only_fhe_uint8(&mut c, params.as_slice());
|
||||
bench_sns_only_fhe_uint10(&mut c, params.as_slice());
|
||||
bench_sns_only_fhe_uint12(&mut c, params.as_slice());
|
||||
bench_sns_only_fhe_uint14(&mut c, params.as_slice());
|
||||
bench_sns_only_fhe_uint16(&mut c, params.as_slice());
|
||||
bench_sns_only_fhe_uint32(&mut c, params.as_slice());
|
||||
bench_sns_only_fhe_uint64(&mut c, params.as_slice());
|
||||
bench_sns_only_fhe_uint128(&mut c, params.as_slice());
|
||||
|
||||
bench_decomp_sns_comp_fhe_uint64(&mut c, &cks);
|
||||
bench_decomp_sns_comp_fhe_uint64(&mut c, params.as_slice());
|
||||
|
||||
c.final_summary();
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ pub mod shortint_params_aliases {
|
||||
ClassicPBSParameters, CompactPublicKeyEncryptionParameters, CompressionParameters,
|
||||
MultiBitPBSParameters, NoiseSquashingCompressionParameters, NoiseSquashingParameters,
|
||||
ShortintKeySwitchingParameters,
|
||||
NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
};
|
||||
// KS PBS Gaussian
|
||||
pub const BENCH_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128: ClassicPBSParameters =
|
||||
@@ -143,6 +144,10 @@ pub mod shortint_params_aliases {
|
||||
NoiseSquashingParameters =
|
||||
V1_4_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||
|
||||
pub const BENCH_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
|
||||
NoiseSquashingParameters =
|
||||
NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||
|
||||
pub const BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
|
||||
NoiseSquashingCompressionParameters =
|
||||
V1_4_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
|
||||
|
||||
@@ -140,13 +140,8 @@ fn execute_multibit_bootstrap_u128(
|
||||
let d_accumulator =
|
||||
CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
|
||||
|
||||
let test_vector_indexes: Vec<u64> = vec![0; par_lwe_list.lwe_ciphertext_count().0];
|
||||
|
||||
let mut d_test_vector_indexes =
|
||||
unsafe { CudaVec::<u64>::new_async(number_of_messages, &stream, 0) };
|
||||
unsafe {
|
||||
d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0)
|
||||
};
|
||||
// We initialize it so cargo won't complain, but we don't use it internally
|
||||
let d_test_vector_indexes = unsafe { CudaVec::<u64>::new_async(1, &stream, 0) };
|
||||
|
||||
let num_blocks = d_lwe_ciphertext_in.lwe_ciphertext_count().0;
|
||||
let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();
|
||||
|
||||
@@ -399,7 +399,6 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<
|
||||
lwe_array_out.as_mut_c_ptr(0),
|
||||
output_indexes.as_c_ptr(0),
|
||||
test_vector.as_c_ptr(0),
|
||||
test_vector_indexes.as_c_ptr(0),
|
||||
lwe_array_in.as_c_ptr(0),
|
||||
input_indexes.as_c_ptr(0),
|
||||
bootstrapping_key.as_c_ptr(0),
|
||||
|
||||
@@ -4,9 +4,12 @@ use crate::high_level_api::{
|
||||
};
|
||||
use crate::integer::U256;
|
||||
use crate::set_server_key;
|
||||
#[cfg(feature = "gpu")]
|
||||
use crate::shortint::parameters::{
|
||||
NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
};
|
||||
use crate::shortint::parameters::{
|
||||
NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
};
|
||||
|
||||
@@ -32,8 +32,8 @@ pub enum CudaBootstrappingKey<Scalar: UnsignedInteger> {
|
||||
impl<Scalar: UnsignedInteger> CudaBootstrappingKey<Scalar> {
|
||||
pub(crate) fn output_lwe_dimension(&self) -> LweDimension {
|
||||
match self {
|
||||
CudaBootstrappingKey::Classic(bsk) => bsk.output_lwe_dimension(),
|
||||
CudaBootstrappingKey::MultiBit(mb_bsk) => mb_bsk.output_lwe_dimension(),
|
||||
Self::Classic(bsk) => bsk.output_lwe_dimension(),
|
||||
Self::MultiBit(mb_bsk) => mb_bsk.output_lwe_dimension(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -487,7 +487,9 @@ fn cpke_params_default_name(params: &CompactPublicKeyEncryptionParameters) -> St
|
||||
}
|
||||
|
||||
named_params_impl!( NoiseSquashingParameters =>
|
||||
V1_4_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,V1_4_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
V1_4_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
V1_4_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
V1_4_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
|
||||
);
|
||||
|
||||
named_params_impl!( NoiseSquashingCompressionParameters =>
|
||||
|
||||
Reference in New Issue
Block a user