chore(gpu): add a benchmark for 128-bit multi-bit noise squashing

- Also, remove the lut indexes concept from the 128-bit multi-bit PBS. The entire backend assumes it does not exist (as is already the case for the classical PBS), so keeping it here would be error-prone.
This commit is contained in:
Pedro Alves
2025-09-04 12:40:32 -03:00
committed by Pedro Alves
parent b566d78621
commit c78cc2d2e9
13 changed files with 259 additions and 213 deletions

View File

@@ -1066,8 +1066,18 @@ template <typename InputTorus> struct int_noise_squashing_lut {
release_radix_ciphertext_async(streams[0], gpu_indexes[0],
tmp_lwe_before_ks, gpu_memory_allocated);
for (int i = 0; i < pbs_buffer.size(); i++) {
cleanup_cuda_programmable_bootstrap_128(streams[i], gpu_indexes[i],
&pbs_buffer[i]);
switch (params.pbs_type) {
case MULTI_BIT:
cleanup_cuda_multi_bit_programmable_bootstrap_128(
streams[i], gpu_indexes[i], &pbs_buffer[i]);
break;
case CLASSICAL:
cleanup_cuda_programmable_bootstrap_128(streams[i], gpu_indexes[i],
&pbs_buffer[i]);
break;
default:
PANIC("Cuda error (PBS): unknown PBS type. ")
}
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
if (lwe_aligned_gather_vec.size() > 0) {

View File

@@ -47,12 +47,11 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
void const *lwe_array_in, void const *lwe_input_indexes,
void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride);
void cleanup_cuda_multi_bit_programmable_bootstrap_128(void *stream,
const uint32_t gpu_index,

View File

@@ -347,18 +347,12 @@ void execute_pbs_async(
auto current_lwe_input_indexes =
get_variant_element(lwe_input_indexes, i);
int gpu_offset =
get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count);
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
streams[i], gpu_indexes[i], current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, base_log, level_count,
num_inputs_on_gpu, num_many_lut, lut_stride);
current_lwe_output_indexes, lut_vec[i], current_lwe_array_in,
current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i],
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_inputs_on_gpu, num_many_lut, lut_stride);
}
break;
case CLASSICAL:

View File

@@ -120,8 +120,8 @@ template <typename InputTorus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
__uint128_t const *bootstrapping_key,
pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -131,45 +131,45 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
case 256:
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, grouping_factor, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 512:
host_multi_bit_programmable_bootstrap_128<InputTorus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, grouping_factor, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 1024:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, grouping_factor, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 2048:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, grouping_factor, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 4096:
host_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, grouping_factor, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -182,8 +182,8 @@ template <typename InputTorus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
__uint128_t const *bootstrapping_key,
pbs_buffer_128<InputTorus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -194,46 +194,46 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, grouping_factor, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 512:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, grouping_factor, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 1024:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, grouping_factor, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 2048:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, grouping_factor, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case 4096:
host_cg_multi_bit_programmable_bootstrap_128<InputTorus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
lwe_output_indexes, lut_vector, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, grouping_factor, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -245,12 +245,11 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
void const *lwe_array_in, void const *lwe_input_indexes,
void const *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride) {
if (base_log > 64)
PANIC("Cuda error (multi-bit PBS): base log should be <= 64")
@@ -263,7 +262,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
uint64_t>(stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const __uint128_t *>(bootstrapping_key), buffer,
@@ -276,7 +274,6 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_128(
stream, gpu_index, static_cast<__uint128_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const __uint128_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint64_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const __uint128_t *>(bootstrapping_key), buffer,

View File

@@ -136,7 +136,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
const InputTorus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_input_indexes,
const __uint128_t *__restrict__ lut_vector,
const InputTorus *__restrict__ lut_vector_indexes,
__uint128_t *global_accumulator, double *global_accumulator_fft,
uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
@@ -169,8 +168,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
auto block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
auto block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.x] *
params::degree * (glwe_dimension + 1)];
auto block_lut_vector = lut_vector;
auto global_slice =
&global_accumulator[(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) *
@@ -368,7 +366,6 @@ __global__ void __launch_bounds__(params::degree / params::opt)
__uint128_t *lwe_array_out,
const InputTorus *__restrict__ lwe_output_indexes,
const __uint128_t *__restrict__ lut_vector,
const InputTorus *__restrict__ lut_vector_indexes,
const InputTorus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_input_indexes,
const double *__restrict__ keybundle_array, double *join_buffer,
@@ -409,8 +406,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
auto block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
auto block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.x] *
params::degree * (glwe_dimension + 1)];
auto block_lut_vector = lut_vector;
auto block_join_buffer =
&join_buffer[blockIdx.x * level_count * (glwe_dimension + 1) *
@@ -591,8 +587,7 @@ __host__ void execute_compute_keybundle_128(
template <typename InputTorus, class params, bool is_first_iter>
__host__ void execute_step_one_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
InputTorus const *lwe_input_indexes,
InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count) {
@@ -618,27 +613,26 @@ __host__ void execute_step_one_128(
device_multi_bit_programmable_bootstrap_accumulate_step_one_128<
InputTorus, params, NOSM, is_first_iter>
<<<grid_accumulate_step_one, thds, 0, stream>>>(
lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes,
global_accumulator, global_accumulator_fft, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, d_mem,
lwe_array_in, lwe_input_indexes, lut_vector, global_accumulator,
global_accumulator_fft, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, d_mem,
full_sm_accumulate_step_one);
else if (max_shared_memory < full_sm_accumulate_step_one)
device_multi_bit_programmable_bootstrap_accumulate_step_one_128<
InputTorus, params, PARTIALSM, is_first_iter>
<<<grid_accumulate_step_one, thds, partial_sm_accumulate_step_one,
stream>>>(lwe_array_in, lwe_input_indexes, lut_vector,
lut_vector_indexes, global_accumulator,
global_accumulator_fft, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, d_mem,
partial_sm_accumulate_step_one);
global_accumulator, global_accumulator_fft, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count,
d_mem, partial_sm_accumulate_step_one);
else
device_multi_bit_programmable_bootstrap_accumulate_step_one_128<
InputTorus, params, FULLSM, is_first_iter>
<<<grid_accumulate_step_one, thds, full_sm_accumulate_step_one,
stream>>>(lwe_array_in, lwe_input_indexes, lut_vector,
lut_vector_indexes, global_accumulator,
global_accumulator_fft, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, d_mem, 0);
global_accumulator, global_accumulator_fft, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count,
d_mem, 0);
check_cuda_error(cudaGetLastError());
}
@@ -691,8 +685,8 @@ template <typename InputTorus, class params>
__host__ void host_multi_bit_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
__uint128_t const *bootstrapping_key,
pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -717,14 +711,14 @@ __host__ void host_multi_bit_programmable_bootstrap_128(
(j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
if (is_first_iter) {
execute_step_one_128<InputTorus, params, true>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count);
stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count);
} else {
execute_step_one_128<InputTorus, params, false>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count);
stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count);
}
if (is_last_iter) {
@@ -745,9 +739,8 @@ __host__ void host_multi_bit_programmable_bootstrap_128(
template <typename InputTorus, class params>
__host__ void execute_cg_external_product_loop_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t const *lut_vector,
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
InputTorus const *lwe_input_indexes, __uint128_t *lwe_array_out,
InputTorus const *lwe_output_indexes,
InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
__uint128_t *lwe_array_out, InputTorus const *lwe_output_indexes,
pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
@@ -780,46 +773,45 @@ __host__ void execute_cg_external_product_loop_128(
auto global_accumulator = buffer->global_accumulator;
auto join_buffer = buffer->global_join_buffer;
void *kernel_args[22];
void *kernel_args[21];
kernel_args[0] = &lwe_array_out;
kernel_args[1] = &lwe_output_indexes;
kernel_args[2] = &lut_vector;
kernel_args[3] = &lut_vector_indexes;
kernel_args[4] = &lwe_array_in;
kernel_args[5] = &lwe_input_indexes;
kernel_args[6] = &keybundle_fft;
kernel_args[7] = &join_buffer;
kernel_args[8] = &global_accumulator;
kernel_args[9] = &lwe_dimension;
kernel_args[10] = &glwe_dimension;
kernel_args[11] = &polynomial_size;
kernel_args[12] = &base_log;
kernel_args[13] = &level_count;
kernel_args[14] = &grouping_factor;
kernel_args[15] = &lwe_offset;
kernel_args[16] = &chunk_size;
kernel_args[17] = &keybundle_size_per_input;
kernel_args[18] = &d_mem;
kernel_args[20] = &num_many_lut;
kernel_args[21] = &lut_stride;
kernel_args[3] = &lwe_array_in;
kernel_args[4] = &lwe_input_indexes;
kernel_args[5] = &keybundle_fft;
kernel_args[6] = &join_buffer;
kernel_args[7] = &global_accumulator;
kernel_args[8] = &lwe_dimension;
kernel_args[9] = &glwe_dimension;
kernel_args[10] = &polynomial_size;
kernel_args[11] = &base_log;
kernel_args[12] = &level_count;
kernel_args[13] = &grouping_factor;
kernel_args[14] = &lwe_offset;
kernel_args[15] = &chunk_size;
kernel_args[16] = &keybundle_size_per_input;
kernel_args[17] = &d_mem;
kernel_args[19] = &num_many_lut;
kernel_args[20] = &lut_stride;
dim3 grid_accumulate(num_samples, glwe_dimension + 1, level_count);
dim3 thds(polynomial_size / params::opt, 1, 1);
if (max_shared_memory < partial_dm) {
kernel_args[19] = &full_dm;
kernel_args[18] = &full_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
InputTorus, params, NOSM>,
grid_accumulate, thds, (void **)kernel_args, 0, stream));
} else if (max_shared_memory < full_dm) {
kernel_args[19] = &partial_dm;
kernel_args[18] = &partial_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
InputTorus, params, PARTIALSM>,
grid_accumulate, thds, (void **)kernel_args, partial_sm, stream));
} else {
kernel_args[19] = &no_dm;
kernel_args[18] = &no_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate_128<
InputTorus, params, FULLSM>,
@@ -831,8 +823,8 @@ template <typename InputTorus, class params>
__host__ void host_cg_multi_bit_programmable_bootstrap_128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *lwe_array_out,
InputTorus const *lwe_output_indexes, __uint128_t const *lut_vector,
InputTorus const *lut_vector_indexes, InputTorus const *lwe_array_in,
InputTorus const *lwe_input_indexes, __uint128_t const *bootstrapping_key,
InputTorus const *lwe_array_in, InputTorus const *lwe_input_indexes,
__uint128_t const *bootstrapping_key,
pbs_buffer_128<InputTorus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
@@ -851,11 +843,10 @@ __host__ void host_cg_multi_bit_programmable_bootstrap_128(
// Accumulate
execute_cg_external_product_loop_128<InputTorus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, base_log, level_count, lwe_offset, num_many_lut,
lut_stride);
stream, gpu_index, lut_vector, lwe_array_in, lwe_input_indexes,
lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
lwe_offset, num_many_lut, lut_stride);
}
}

View File

@@ -2698,7 +2698,6 @@ unsafe extern "C" {
lwe_array_out: *mut ffi::c_void,
lwe_output_indexes: *const ffi::c_void,
lut_vector: *const ffi::c_void,
lut_vector_indexes: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,

View File

@@ -1,8 +1,12 @@
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
#[cfg(feature = "gpu")]
use benchmark::params_aliases::{
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
#[cfg(not(feature = "gpu"))]
@@ -25,8 +29,12 @@ use tfhe::prelude::*;
#[cfg(feature = "gpu")]
use tfhe::core_crypto::gpu::get_number_of_gpus;
use tfhe::shortint::parameters::{
CompressionParameters, NoiseSquashingCompressionParameters, NoiseSquashingParameters,
};
use tfhe::shortint::PBSParameters;
#[cfg(feature = "gpu")]
use tfhe::{set_server_key, GpuIndex};
use tfhe::GpuIndex;
use tfhe::{
ClientKey, CompressedCiphertextListBuilder, CompressedServerKey,
CompressedSquashedNoiseCiphertextListBuilder, FheUint10, FheUint12, FheUint128, FheUint14,
@@ -36,18 +44,42 @@ use tfhe::{
fn bench_sns_only_fhe_type<FheType>(
c: &mut Criterion,
client_key: &ClientKey,
params: (
PBSParameters,
NoiseSquashingParameters,
NoiseSquashingCompressionParameters,
CompressionParameters,
),
type_name: &str,
num_bits: usize,
) where
FheType: FheEncrypt<u128, ClientKey> + Send + Sync,
FheType: SquashNoise,
{
let (param, noise_param, _, _) = params;
use tfhe::{set_server_key, ConfigBuilder};
let config = ConfigBuilder::with_custom_parameters(param)
.enable_noise_squashing(noise_param)
.build();
let client_key = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&client_key);
#[cfg(feature = "gpu")]
set_server_key(compressed_sks.decompress_to_gpu());
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
{
let decompressed_sks = compressed_sks.decompress();
rayon::broadcast(|_| set_server_key(decompressed_sks.clone()));
set_server_key(decompressed_sks);
}
let mut bench_group = c.benchmark_group(type_name);
let bench_id_prefix = if cfg!(feature = "gpu") {
"hlapi::cuda"
format!("hlapi::cuda::{}", noise_param.name())
} else {
"hlapi"
"hlapi".to_string()
};
let bench_id_suffix = format!("noise_squash::{type_name}");
@@ -60,9 +92,9 @@ fn bench_sns_only_fhe_type<FheType>(
bench_id = format!("{bench_id_prefix}::{bench_id_suffix}");
#[cfg(feature = "gpu")]
configure_gpu(client_key);
configure_gpu(&client_key);
let input = FheType::encrypt(rng.gen(), client_key);
let input = FheType::encrypt(rng.gen(), &client_key);
bench_group.bench_function(&bench_id, |b| {
b.iter(|| {
@@ -82,7 +114,7 @@ fn bench_sns_only_fhe_type<FheType>(
bench_group.throughput(Throughput::Elements(elements));
println!("elements: {elements}");
let gpu_count = get_number_of_gpus() as usize;
let compressed_server_key = CompressedServerKey::new(client_key);
let compressed_server_key = CompressedServerKey::new(&client_key);
let sks_vec = (0..gpu_count)
.map(|i| {
compressed_server_key.decompress_to_specific_gpu(GpuIndex::new(i as u32))
@@ -92,7 +124,7 @@ fn bench_sns_only_fhe_type<FheType>(
bench_group.bench_function(&bench_id, |b| {
let encrypt_values = || {
(0..elements)
.map(|_| FheType::encrypt(rng.gen(), client_key))
.map(|_| FheType::encrypt(rng.gen(), &client_key))
.collect::<Vec<_>>()
};
@@ -118,7 +150,7 @@ fn bench_sns_only_fhe_type<FheType>(
bench_group.bench_function(&bench_id, |b| {
let encrypt_values = || {
(0..elements)
.map(|_| FheType::encrypt(rng.gen(), client_key))
.map(|_| FheType::encrypt(rng.gen(), &client_key))
.collect::<Vec<_>>()
};
@@ -150,7 +182,12 @@ fn bench_sns_only_fhe_type<FheType>(
fn bench_decomp_sns_comp_fhe_type<FheType>(
c: &mut Criterion,
client_key: &ClientKey,
params: (
PBSParameters,
NoiseSquashingParameters,
NoiseSquashingCompressionParameters,
CompressionParameters,
),
type_name: &str,
num_bits: usize,
) where
@@ -158,11 +195,32 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
FheType: SquashNoise + Tagged + HlExpandable + HlCompressible,
<FheType as SquashNoise>::Output: HlSquashedNoiseCompressible,
{
let (param, noise_param, comp_noise_param, comp_param) = params;
use tfhe::{set_server_key, ConfigBuilder};
let config = ConfigBuilder::with_custom_parameters(param)
.enable_noise_squashing(noise_param)
.enable_noise_squashing_compression(comp_noise_param)
.enable_compression(comp_param)
.build();
let client_key = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&client_key);
#[cfg(feature = "gpu")]
set_server_key(compressed_sks.decompress_to_gpu());
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
{
let decompressed_sks = compressed_sks.decompress();
rayon::broadcast(|_| set_server_key(decompressed_sks.clone()));
set_server_key(decompressed_sks);
}
let mut bench_group = c.benchmark_group(type_name);
let bench_id_prefix = if cfg!(feature = "gpu") {
"hlapi::cuda"
format!("hlapi::cuda::{}", noise_param.name())
} else {
"hlapi"
"hlapi".to_string()
};
let bench_id_suffix = format!("decomp_noise_squash_comp::{type_name}");
@@ -175,9 +233,9 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
bench_id = format!("{bench_id_prefix}::{bench_id_suffix}");
#[cfg(feature = "gpu")]
configure_gpu(client_key);
configure_gpu(&client_key);
let input = FheType::encrypt(rng.gen(), client_key);
let input = FheType::encrypt(rng.gen(), &client_key);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(input);
@@ -205,7 +263,7 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
bench_group.throughput(Throughput::Elements(elements));
println!("elements: {elements}");
let gpu_count = get_number_of_gpus() as usize;
let compressed_server_key = CompressedServerKey::new(client_key);
let compressed_server_key = CompressedServerKey::new(&client_key);
let sks_vec = (0..gpu_count)
.map(|i| {
compressed_server_key.decompress_to_specific_gpu(GpuIndex::new(i as u32))
@@ -216,7 +274,7 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
let compressed_values = || {
(0..elements)
.map(|_| {
let input = FheType::encrypt(rng.gen(), client_key);
let input = FheType::encrypt(rng.gen(), &client_key);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(input);
builder.build().unwrap()
@@ -254,7 +312,7 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
let compressed_values = || {
(0..elements)
.map(|_| {
let input = FheType::encrypt(rng.gen(), client_key);
let input = FheType::encrypt(rng.gen(), &client_key);
let mut builder = CompressedCiphertextListBuilder::new();
builder.push(input);
builder.build().unwrap()
@@ -296,8 +354,10 @@ fn bench_decomp_sns_comp_fhe_type<FheType>(
macro_rules! bench_sns_only_type {
($fhe_type:ident) => {
::paste::paste! {
fn [<bench_sns_only_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
bench_sns_only_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
fn [<bench_sns_only_ $fhe_type:snake>](c: &mut Criterion, params: &[(PBSParameters, NoiseSquashingParameters, NoiseSquashingCompressionParameters, CompressionParameters)]) {
for param in params {
bench_sns_only_fhe_type::<$fhe_type>(c, *param, stringify!($fhe_type), $fhe_type::num_bits());
}
}
}
};
@@ -306,8 +366,10 @@ macro_rules! bench_sns_only_type {
macro_rules! bench_decomp_sns_comp_type {
($fhe_type:ident) => {
::paste::paste! {
fn [<bench_decomp_sns_comp_ $fhe_type:snake>](c: &mut Criterion, cks: &ClientKey) {
bench_decomp_sns_comp_fhe_type::<$fhe_type>(c, cks, stringify!($fhe_type), $fhe_type::num_bits());
fn [<bench_decomp_sns_comp_ $fhe_type:snake>](c: &mut Criterion, params: &[(PBSParameters, NoiseSquashingParameters, NoiseSquashingCompressionParameters, CompressionParameters)]) {
for param in params {
bench_decomp_sns_comp_fhe_type::<$fhe_type>(c, *param, stringify!($fhe_type), $fhe_type::num_bits());
}
}
}
};
@@ -330,65 +392,55 @@ bench_decomp_sns_comp_type!(FheUint64);
fn main() {
#[cfg(feature = "hpu")]
panic!("Noise squashing is not supported on HPU");
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
let cks = {
use benchmark::params_aliases::BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
use tfhe::{set_server_key, ConfigBuilder};
let config = ConfigBuilder::with_custom_parameters(
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_noise_squashing(BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
.enable_noise_squashing_compression(
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_compression(BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128)
.build();
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);
let decompressed_sks = compressed_sks.decompress();
rayon::broadcast(|_| set_server_key(decompressed_sks.clone()));
set_server_key(decompressed_sks);
cks
};
#[cfg(feature = "gpu")]
let cks = {
use tfhe::{set_server_key, ConfigBuilder};
let config = ConfigBuilder::with_custom_parameters(
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_noise_squashing(
BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_noise_squashing_compression(
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.enable_compression(
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)
.build();
let cks = ClientKey::generate(config);
let compressed_sks = CompressedServerKey::new(&cks);
let params: Vec<(
PBSParameters,
NoiseSquashingParameters,
NoiseSquashingCompressionParameters,
CompressionParameters,
)> = {
#[cfg(all(not(feature = "hpu"), not(feature = "gpu")))]
{
vec![(
BENCH_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
BENCH_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
)]
}
set_server_key(compressed_sks.decompress_to_gpu());
cks
#[cfg(feature = "gpu")]
{
vec![(
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
BENCH_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
), (
BENCH_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128.into(),
BENCH_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
BENCH_COMP_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
),
]
}
};
let mut c = Criterion::default().configure_from_args();
bench_sns_only_fhe_uint2(&mut c, &cks);
bench_sns_only_fhe_uint4(&mut c, &cks);
bench_sns_only_fhe_uint6(&mut c, &cks);
bench_sns_only_fhe_uint8(&mut c, &cks);
bench_sns_only_fhe_uint10(&mut c, &cks);
bench_sns_only_fhe_uint12(&mut c, &cks);
bench_sns_only_fhe_uint14(&mut c, &cks);
bench_sns_only_fhe_uint16(&mut c, &cks);
bench_sns_only_fhe_uint32(&mut c, &cks);
bench_sns_only_fhe_uint64(&mut c, &cks);
bench_sns_only_fhe_uint128(&mut c, &cks);
bench_sns_only_fhe_uint2(&mut c, params.as_slice());
bench_sns_only_fhe_uint4(&mut c, params.as_slice());
bench_sns_only_fhe_uint6(&mut c, params.as_slice());
bench_sns_only_fhe_uint8(&mut c, params.as_slice());
bench_sns_only_fhe_uint10(&mut c, params.as_slice());
bench_sns_only_fhe_uint12(&mut c, params.as_slice());
bench_sns_only_fhe_uint14(&mut c, params.as_slice());
bench_sns_only_fhe_uint16(&mut c, params.as_slice());
bench_sns_only_fhe_uint32(&mut c, params.as_slice());
bench_sns_only_fhe_uint64(&mut c, params.as_slice());
bench_sns_only_fhe_uint128(&mut c, params.as_slice());
bench_decomp_sns_comp_fhe_uint64(&mut c, &cks);
bench_decomp_sns_comp_fhe_uint64(&mut c, params.as_slice());
c.final_summary();
}

View File

@@ -7,6 +7,7 @@ pub mod shortint_params_aliases {
ClassicPBSParameters, CompactPublicKeyEncryptionParameters, CompressionParameters,
MultiBitPBSParameters, NoiseSquashingCompressionParameters, NoiseSquashingParameters,
ShortintKeySwitchingParameters,
NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
// KS PBS Gaussian
pub const BENCH_PARAM_MESSAGE_1_CARRY_1_KS_PBS_GAUSSIAN_2M128: ClassicPBSParameters =
@@ -143,6 +144,10 @@ pub mod shortint_params_aliases {
NoiseSquashingParameters =
V1_4_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
pub const BENCH_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingParameters =
NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;
pub const BENCH_COMP_NOISE_SQUASHING_PARAM_GPU_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128:
NoiseSquashingCompressionParameters =
V1_4_NOISE_SQUASHING_COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128;

View File

@@ -140,13 +140,8 @@ fn execute_multibit_bootstrap_u128(
let d_accumulator =
CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
let test_vector_indexes: Vec<u64> = vec![0; par_lwe_list.lwe_ciphertext_count().0];
let mut d_test_vector_indexes =
unsafe { CudaVec::<u64>::new_async(number_of_messages, &stream, 0) };
unsafe {
d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0)
};
// We initialize it so cargo won't complain, but we don't use it internally
let d_test_vector_indexes = unsafe { CudaVec::<u64>::new_async(1, &stream, 0) };
let num_blocks = d_lwe_ciphertext_in.lwe_ciphertext_count().0;
let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();

View File

@@ -399,7 +399,6 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<
lwe_array_out.as_mut_c_ptr(0),
output_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
input_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),

View File

@@ -4,9 +4,12 @@ use crate::high_level_api::{
};
use crate::integer::U256;
use crate::set_server_key;
#[cfg(feature = "gpu")]
use crate::shortint::parameters::{
NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};
use crate::shortint::parameters::{
NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
};

View File

@@ -32,8 +32,8 @@ pub enum CudaBootstrappingKey<Scalar: UnsignedInteger> {
impl<Scalar: UnsignedInteger> CudaBootstrappingKey<Scalar> {
pub(crate) fn output_lwe_dimension(&self) -> LweDimension {
match self {
CudaBootstrappingKey::Classic(bsk) => bsk.output_lwe_dimension(),
CudaBootstrappingKey::MultiBit(mb_bsk) => mb_bsk.output_lwe_dimension(),
Self::Classic(bsk) => bsk.output_lwe_dimension(),
Self::MultiBit(mb_bsk) => mb_bsk.output_lwe_dimension(),
}
}
}

View File

@@ -487,7 +487,9 @@ fn cpke_params_default_name(params: &CompactPublicKeyEncryptionParameters) -> St
}
named_params_impl!( NoiseSquashingParameters =>
V1_4_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,V1_4_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
V1_4_NOISE_SQUASHING_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
V1_4_NOISE_SQUASHING_PARAM_GPU_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
V1_4_NOISE_SQUASHING_PARAM_GPU_MULTI_BIT_GROUP_4_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M128,
);
named_params_impl!( NoiseSquashingCompressionParameters =>