mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-06 21:34:05 -05:00
feat(gpu): support keyswitch 64/32 in PBS
This commit is contained in:
@@ -7,23 +7,23 @@ template <typename Torus>
|
||||
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size, uint32_t max_shared_memory);
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
@@ -48,7 +48,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride);
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
uint64_t scratch_cuda_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
@@ -96,7 +96,7 @@ template <typename Torus>
|
||||
uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
|
||||
uint32_t polynomial_size);
|
||||
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension,
|
||||
uint32_t level_count, uint64_t full_sm_keybundle);
|
||||
|
||||
@@ -428,29 +428,29 @@ uint64_t get_buffer_size_programmable_bootstrap_cg(
|
||||
return buffer_size + buffer_size % sizeof(double2);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
uint32_t num_samples,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride);
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
@@ -458,11 +458,11 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
uint32_t lut_stride);
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
@@ -477,14 +477,14 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc(
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
#endif
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
uint64_t scratch_cuda_programmable_bootstrap_cg(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
uint64_t scratch_cuda_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
|
||||
@@ -69,13 +69,19 @@ uint64_t scratch_cuda_programmable_bootstrap_64(
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
uint64_t scratch_cuda_programmable_bootstrap_32_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
uint64_t scratch_cuda_programmable_bootstrap_128(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
@@ -84,7 +90,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
|
||||
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
|
||||
@@ -4,12 +4,12 @@
|
||||
#include "pbs_enums.h"
|
||||
#include "stdint.h"
|
||||
|
||||
extern "C" {
|
||||
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t max_shared_memory);
|
||||
|
||||
extern "C" {
|
||||
|
||||
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
|
||||
void *stream, uint32_t gpu_index, void *dest, void const *src,
|
||||
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
|
||||
@@ -20,6 +20,11 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
|
||||
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor);
|
||||
|
||||
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_32_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
@@ -35,6 +40,16 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride);
|
||||
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
void const *lwe_input_indexes, void const *bootstrapping_key,
|
||||
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride);
|
||||
|
||||
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
|
||||
uint32_t gpu_index,
|
||||
int8_t **pbs_buffer);
|
||||
|
||||
@@ -535,7 +535,6 @@ __host__ void host_gemm_keyswitch_lwe_ciphertext_vector(
|
||||
dim3 grid_negate(CEIL_DIV(lwe_dimension_out + 1, BLOCK_SIZE_DECOMP),
|
||||
CEIL_DIV(num_samples, BLOCK_SIZE_DECOMP));
|
||||
dim3 threads_negate(BLOCK_SIZE_DECOMP, BLOCK_SIZE_DECOMP);
|
||||
|
||||
// Negate all outputs in the output LWEs. This is the final step in the GEMM
|
||||
// keyswitch computed as: -(-b + sum(a_i A_KSK))
|
||||
keyswitch_negate_with_output_indices<Torus, KSTorus>
|
||||
|
||||
@@ -263,42 +263,7 @@ void execute_pbs_async(CudaStreams streams,
|
||||
uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
|
||||
if constexpr (std::is_same_v<OutputTorus, uint32_t>) {
|
||||
// 32 bits
|
||||
switch (pbs_type) {
|
||||
case MULTI_BIT:
|
||||
PANIC("Error: 32-bit multibit PBS is not supported.\n")
|
||||
case CLASSICAL:
|
||||
for (uint i = 0; i < streams.count(); i++) {
|
||||
int num_inputs_on_gpu = get_num_inputs_on_gpu(
|
||||
input_lwe_ciphertext_count, i, streams.count());
|
||||
|
||||
int gpu_offset =
|
||||
get_gpu_offset(input_lwe_ciphertext_count, i, streams.count());
|
||||
auto d_lut_vector_indexes =
|
||||
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
|
||||
|
||||
// Use the macro to get the correct elements for the current iteration
|
||||
// Handles the case when the input/output are scattered through
|
||||
// different gpus and when it is not
|
||||
auto current_lwe_array_out = get_variant_element(lwe_array_out, i);
|
||||
auto current_lwe_output_indexes =
|
||||
get_variant_element(lwe_output_indexes, i);
|
||||
auto current_lwe_array_in = get_variant_element(lwe_array_in, i);
|
||||
auto current_lwe_input_indexes =
|
||||
get_variant_element(lwe_input_indexes, i);
|
||||
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
|
||||
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
|
||||
current_lwe_array_in, current_lwe_input_indexes,
|
||||
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
|
||||
polynomial_size, base_log, level_count, num_inputs_on_gpu,
|
||||
num_many_lut, lut_stride);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
PANIC("Error: unsupported cuda PBS type.")
|
||||
}
|
||||
PANIC("Error: unsupported 32b CUDA PBS type.")
|
||||
} else if constexpr (std::is_same_v<OutputTorus, uint64_t>) {
|
||||
// 64 bits
|
||||
switch (pbs_type) {
|
||||
@@ -353,7 +318,7 @@ void execute_pbs_async(CudaStreams streams,
|
||||
auto d_lut_vector_indexes =
|
||||
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
|
||||
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
|
||||
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
|
||||
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
|
||||
current_lwe_array_in, current_lwe_input_indexes,
|
||||
|
||||
@@ -35,12 +35,13 @@ namespace cg = cooperative_groups;
|
||||
*
|
||||
* Each y-block computes one element of the lwe_array_out.
|
||||
*/
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
template <typename InputTorus, typename Torus, class params,
|
||||
sharedMemDegree SMD>
|
||||
__global__ void device_programmable_bootstrap_cg(
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
@@ -79,7 +80,7 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
|
||||
// blockIdx.x selects which input ciphertext this block is operating on, in
|
||||
// the case of batch bootstraps
|
||||
const Torus *block_lwe_array_in =
|
||||
const InputTorus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
@@ -95,8 +96,8 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
|
||||
// Put "b" in [0, 2N[
|
||||
constexpr auto log_modulus = params::log2_degree + 1;
|
||||
Torus b_hat = 0;
|
||||
Torus correction = 0;
|
||||
InputTorus b_hat = 0;
|
||||
InputTorus correction = 0;
|
||||
if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) {
|
||||
correction = centered_binary_modulus_switch_body_correction_to_add(
|
||||
block_lwe_array_in, lwe_dimension, log_modulus);
|
||||
@@ -113,7 +114,7 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
__syncthreads();
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
Torus a_hat = 0;
|
||||
InputTorus a_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);
|
||||
|
||||
// Perform ACC * (X^a_hat - 1)
|
||||
@@ -193,7 +194,7 @@ __global__ void device_programmable_bootstrap_cg(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, typename params>
|
||||
template <typename InputTorus, typename Torus, typename params>
|
||||
__host__ uint64_t scratch_programmable_bootstrap_cg(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t lwe_dimension,
|
||||
@@ -210,18 +211,18 @@ __host__ uint64_t scratch_programmable_bootstrap_cg(
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
|
||||
device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
|
||||
device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory >= partial_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
|
||||
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
|
||||
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -237,11 +238,11 @@ __host__ uint64_t scratch_programmable_bootstrap_cg(
|
||||
/*
|
||||
* Host wrapper
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
__host__ void host_programmable_bootstrap_cg(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
@@ -293,26 +294,29 @@ __host__ void host_programmable_bootstrap_cg(
|
||||
if (max_shared_memory < partial_sm) {
|
||||
kernel_args[13] = &full_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_programmable_bootstrap_cg<Torus, params, NOSM>, grid,
|
||||
thds, (void **)kernel_args, 0, stream));
|
||||
(void *)
|
||||
device_programmable_bootstrap_cg<InputTorus, Torus, params, NOSM>,
|
||||
grid, thds, (void **)kernel_args, 0, stream));
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
kernel_args[13] = &partial_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
|
||||
(void *)device_programmable_bootstrap_cg<InputTorus, Torus, params,
|
||||
PARTIALSM>,
|
||||
grid, thds, (void **)kernel_args, partial_sm, stream));
|
||||
} else {
|
||||
int no_dm = 0;
|
||||
kernel_args[13] = &no_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_programmable_bootstrap_cg<Torus, params, FULLSM>, grid,
|
||||
thds, (void **)kernel_args, full_sm, stream));
|
||||
(void *)
|
||||
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
|
||||
grid, thds, (void **)kernel_args, full_sm, stream));
|
||||
}
|
||||
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
// Verify if the grid size satisfies the cooperative group constraints
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
__host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
|
||||
int glwe_dimension, int level_count, int num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
@@ -338,30 +342,34 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
|
||||
if (max_shared_memory < partial_sm) {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_programmable_bootstrap_cg<Torus, params, NOSM>, thds, 0);
|
||||
(void *)
|
||||
device_programmable_bootstrap_cg<InputTorus, Torus, params, NOSM>,
|
||||
thds, 0);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
|
||||
device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
|
||||
device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
|
||||
(void *)device_programmable_bootstrap_cg<InputTorus, Torus, params,
|
||||
PARTIALSM>,
|
||||
thds, partial_sm);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
|
||||
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
|
||||
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_programmable_bootstrap_cg<Torus, params, FULLSM>, thds,
|
||||
full_sm);
|
||||
(void *)
|
||||
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
|
||||
thds, full_sm);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
@@ -372,39 +380,39 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
|
||||
}
|
||||
|
||||
// Verify if the grid size satisfies the cooperative group constraints
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
__host__ bool supports_cooperative_groups_on_programmable_bootstrap(
|
||||
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return verify_cuda_programmable_bootstrap_cg_grid_size<
|
||||
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<256>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 512:
|
||||
return verify_cuda_programmable_bootstrap_cg_grid_size<
|
||||
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<512>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 1024:
|
||||
return verify_cuda_programmable_bootstrap_cg_grid_size<
|
||||
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<1024>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 2048:
|
||||
return verify_cuda_programmable_bootstrap_cg_grid_size<
|
||||
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<2048>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 4096:
|
||||
return verify_cuda_programmable_bootstrap_cg_grid_size<
|
||||
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<4096>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 8192:
|
||||
return verify_cuda_programmable_bootstrap_cg_grid_size<
|
||||
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<8192>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 16384:
|
||||
return verify_cuda_programmable_bootstrap_cg_grid_size<
|
||||
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<16384>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
|
||||
"Supported N's are powers of two"
|
||||
|
||||
@@ -18,13 +18,14 @@
|
||||
#include "types/complex/operations.cuh"
|
||||
#include <vector>
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
template <typename InputTorus, typename Torus, class params,
|
||||
sharedMemDegree SMD>
|
||||
__global__ void __launch_bounds__(params::degree / params::opt)
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate(
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double2 *__restrict__ keybundle_array, double2 *join_buffer,
|
||||
Torus *global_accumulator, uint32_t lwe_dimension,
|
||||
@@ -60,7 +61,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
|
||||
// blockIdx.x selects which input ciphertext this block is operating on, in
|
||||
// the case of batch bootstraps
|
||||
const Torus *block_lwe_array_in =
|
||||
const InputTorus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
@@ -80,7 +81,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
|
||||
if (lwe_offset == 0) {
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
InputTorus b_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
@@ -208,7 +209,7 @@ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
|
||||
return buffer_size + buffer_size % sizeof(double2);
|
||||
}
|
||||
|
||||
template <typename Torus, typename params>
|
||||
template <typename InputTorus, typename Torus, typename params>
|
||||
__host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
|
||||
@@ -230,57 +231,59 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
if (max_shared_memory < full_sm_keybundle) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
|
||||
params, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
|
||||
params, NOSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
if (max_shared_memory < partial_sm_cg_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
|
||||
NOSM>,
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
|
||||
params, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
|
||||
NOSM>,
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
|
||||
params, NOSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory < full_sm_cg_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
|
||||
PARTIALSM>,
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<
|
||||
InputTorus, Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
|
||||
PARTIALSM>,
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<
|
||||
InputTorus, Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<InputTorus, Torus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
|
||||
level_count, full_sm_keybundle);
|
||||
uint64_t size_tracker = 0;
|
||||
@@ -291,10 +294,10 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
__host__ void execute_cg_external_product_loop(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
|
||||
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
@@ -359,28 +362,28 @@ __host__ void execute_cg_external_product_loop(
|
||||
kernel_args[19] = &full_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
|
||||
Torus, params, NOSM>,
|
||||
InputTorus, Torus, params, NOSM>,
|
||||
grid_accumulate, thds, (void **)kernel_args, 0, stream));
|
||||
} else if (max_shared_memory < full_dm) {
|
||||
kernel_args[19] = &partial_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
|
||||
Torus, params, PARTIALSM>,
|
||||
InputTorus, Torus, params, PARTIALSM>,
|
||||
grid_accumulate, thds, (void **)kernel_args, partial_sm, stream));
|
||||
} else {
|
||||
kernel_args[19] = &no_dm;
|
||||
check_cuda_error(cudaLaunchCooperativeKernel(
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
|
||||
Torus, params, FULLSM>,
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
grid_accumulate, thds, (void **)kernel_args, full_sm, stream));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
__host__ void host_cg_multi_bit_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, uint64_t const *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
@@ -393,13 +396,13 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
// Compute a keybundle
|
||||
execute_compute_keybundle<Torus, params>(
|
||||
execute_compute_keybundle<InputTorus, Torus, params>(
|
||||
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, level_count, lwe_offset);
|
||||
|
||||
// Accumulate
|
||||
execute_cg_external_product_loop<Torus, params>(
|
||||
execute_cg_external_product_loop<InputTorus, Torus, params>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
|
||||
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
@@ -409,7 +412,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
|
||||
}
|
||||
|
||||
// Verify if the grid size satisfies the cooperative group constraints
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
__host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
|
||||
int glwe_dimension, int level_count, int num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
@@ -436,36 +439,36 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
|
||||
Torus, params, NOSM>,
|
||||
InputTorus, Torus, params, NOSM>,
|
||||
thds, 0);
|
||||
} else if (max_shared_memory < full_sm_cg_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
|
||||
PARTIALSM>,
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<
|
||||
InputTorus, Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
|
||||
PARTIALSM>,
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<
|
||||
InputTorus, Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
|
||||
Torus, params, PARTIALSM>,
|
||||
InputTorus, Torus, params, PARTIALSM>,
|
||||
thds, partial_sm_cg_accumulate);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
|
||||
Torus, params, FULLSM>,
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
thds, full_sm_cg_accumulate);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -478,39 +481,39 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
|
||||
|
||||
// Verify if the grid size for the multi-bit kernel satisfies the cooperative
|
||||
// group constraints
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
__host__ bool supports_cooperative_groups_on_multibit_programmable_bootstrap(
|
||||
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<256>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 512:
|
||||
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<512>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 1024:
|
||||
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<1024>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 2048:
|
||||
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<2048>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 4096:
|
||||
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<4096>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 8192:
|
||||
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<8192>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
case 16384:
|
||||
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
|
||||
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<16384>>(
|
||||
glwe_dimension, level_count, num_samples, max_shared_memory);
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
|
||||
@@ -7,18 +7,19 @@
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t level_count,
|
||||
uint32_t num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
return supports_cooperative_groups_on_programmable_bootstrap<Torus>(
|
||||
return supports_cooperative_groups_on_programmable_bootstrap<InputTorus,
|
||||
Torus>(
|
||||
glwe_dimension, polynomial_size, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
@@ -28,38 +29,39 @@ bool has_support_to_cuda_programmable_bootstrap_tbc(
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 512:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 1024:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 2048:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, Degree<2048>>(num_samples, glwe_dimension, polynomial_size,
|
||||
level_count, max_shared_memory);
|
||||
InputTorus, Torus, Degree<2048>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 4096:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 8192:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 16384:
|
||||
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
default:
|
||||
PANIC("Cuda error (classical PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
@@ -71,7 +73,7 @@ bool has_support_to_cuda_programmable_bootstrap_tbc(
|
||||
}
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
uint64_t scratch_cuda_programmable_bootstrap_tbc(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
@@ -80,37 +82,43 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<256>>(
|
||||
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 512:
|
||||
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<512>>(
|
||||
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 1024:
|
||||
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<1024>>(
|
||||
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 2048:
|
||||
return scratch_programmable_bootstrap_tbc<Torus, Degree<2048>>(
|
||||
return scratch_programmable_bootstrap_tbc<InputTorus, Torus, Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 4096:
|
||||
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
|
||||
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 8192:
|
||||
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
|
||||
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
|
||||
AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 16384:
|
||||
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
|
||||
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
|
||||
AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
@@ -121,11 +129,11 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
@@ -134,7 +142,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<256>>(
|
||||
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -142,7 +150,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<512>>(
|
||||
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -150,7 +158,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<1024>>(
|
||||
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -158,7 +166,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_tbc<Torus, Degree<2048>>(
|
||||
host_programmable_bootstrap_tbc<InputTorus, Torus, Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -166,7 +174,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
|
||||
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -174,7 +182,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 8192:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
|
||||
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -182,7 +190,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 16384:
|
||||
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
|
||||
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -197,7 +205,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
|
||||
}
|
||||
#endif
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
uint64_t scratch_cuda_programmable_bootstrap_cg(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
@@ -206,37 +214,44 @@ uint64_t scratch_cuda_programmable_bootstrap_cg(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<256>>(
|
||||
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 512:
|
||||
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<512>>(
|
||||
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 1024:
|
||||
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<1024>>(
|
||||
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 2048:
|
||||
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
|
||||
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 4096:
|
||||
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
|
||||
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 8192:
|
||||
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
|
||||
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
|
||||
AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 16384:
|
||||
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
|
||||
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
|
||||
AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
@@ -247,7 +262,7 @@ uint64_t scratch_cuda_programmable_bootstrap_cg(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
uint64_t scratch_cuda_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
@@ -256,37 +271,44 @@ uint64_t scratch_cuda_programmable_bootstrap(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return scratch_programmable_bootstrap<Torus, AmortizedDegree<256>>(
|
||||
return scratch_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 512:
|
||||
return scratch_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
return scratch_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 1024:
|
||||
return scratch_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
return scratch_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 2048:
|
||||
return scratch_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
return scratch_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 4096:
|
||||
return scratch_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
return scratch_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 8192:
|
||||
return scratch_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
return scratch_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
case 16384:
|
||||
return scratch_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
return scratch_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
@@ -299,11 +321,10 @@ uint64_t scratch_cuda_programmable_bootstrap(
|
||||
|
||||
/*
|
||||
* This scratch function allocates the necessary amount of data on the GPU for
|
||||
* the classical PBS on 32 bits inputs, into `buffer`. It also
|
||||
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
|
||||
* be used.
|
||||
* the PBS on 64 bits inputs, into `buffer`. It also configures SM options on
|
||||
* the GPU in case FULLSM or PARTIALSM mode is going to be used.
|
||||
*/
|
||||
uint64_t scratch_cuda_programmable_bootstrap_32(
|
||||
uint64_t scratch_cuda_programmable_bootstrap_32_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
@@ -311,25 +332,25 @@ uint64_t scratch_cuda_programmable_bootstrap_32(
|
||||
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
#if (CUDA_ARCH >= 900)
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t, uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
level_count, max_shared_memory))
|
||||
return scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
|
||||
return scratch_cuda_programmable_bootstrap_tbc<uint32_t, uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
else
|
||||
#endif
|
||||
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
|
||||
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t, uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
return scratch_cuda_programmable_bootstrap_cg<uint32_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
|
||||
return scratch_cuda_programmable_bootstrap_cg<uint32_t, uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
else
|
||||
return scratch_cuda_programmable_bootstrap<uint32_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
|
||||
return scratch_cuda_programmable_bootstrap<uint32_t, uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
}
|
||||
@@ -347,34 +368,34 @@ uint64_t scratch_cuda_programmable_bootstrap_64(
|
||||
|
||||
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
|
||||
#if (CUDA_ARCH >= 900)
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t, uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
level_count, max_shared_memory))
|
||||
return scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
return scratch_cuda_programmable_bootstrap_tbc<uint64_t, uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
else
|
||||
#endif
|
||||
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t, uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, max_shared_memory))
|
||||
return scratch_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
return scratch_cuda_programmable_bootstrap_cg<uint64_t, uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
else
|
||||
return scratch_cuda_programmable_bootstrap<uint64_t>(
|
||||
return scratch_cuda_programmable_bootstrap<uint64_t, uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
@@ -383,7 +404,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_programmable_bootstrap_cg<Torus, AmortizedDegree<256>>(
|
||||
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -391,7 +412,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap_cg<Torus, AmortizedDegree<512>>(
|
||||
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -399,7 +420,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap_cg<Torus, AmortizedDegree<1024>>(
|
||||
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -407,7 +428,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
|
||||
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -415,7 +436,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
|
||||
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -423,7 +444,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 8192:
|
||||
host_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
|
||||
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -431,7 +452,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 16384:
|
||||
host_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
|
||||
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -445,11 +466,11 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
@@ -458,7 +479,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_programmable_bootstrap<Torus, AmortizedDegree<256>>(
|
||||
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -466,7 +487,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 512:
|
||||
host_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -474,7 +495,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 1024:
|
||||
host_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -482,7 +503,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 2048:
|
||||
host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -490,7 +511,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 4096:
|
||||
host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -498,7 +519,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 8192:
|
||||
host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -506,7 +527,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case 16384:
|
||||
host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
|
||||
@@ -522,7 +543,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
|
||||
/* Perform bootstrapping on a batch of input u32 LWE ciphertexts.
|
||||
*/
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
@@ -530,23 +551,24 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
|
||||
if (base_log > 64)
|
||||
PANIC("Cuda error (classical PBS): base log should be <= 64")
|
||||
|
||||
if (base_log > 32)
|
||||
PANIC("Cuda error (classical PBS): base log should be <= 32")
|
||||
pbs_buffer<uint64_t, CLASSICAL> *buffer =
|
||||
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
|
||||
|
||||
pbs_buffer<uint32_t, CLASSICAL> *buffer =
|
||||
(pbs_buffer<uint32_t, CLASSICAL> *)mem_ptr;
|
||||
check_cuda_error(cudaGetLastError());
|
||||
|
||||
switch (buffer->pbs_variant) {
|
||||
case TBC:
|
||||
#if CUDA_ARCH >= 900
|
||||
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
|
||||
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<const uint32_t *>(lwe_output_indexes),
|
||||
static_cast<const uint32_t *>(lut_vector),
|
||||
static_cast<const uint32_t *>(lut_vector_indexes),
|
||||
case PBS_VARIANT::TBC:
|
||||
#if (CUDA_ARCH >= 900)
|
||||
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t, uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint32_t *>(lwe_array_in),
|
||||
static_cast<const uint32_t *>(lwe_input_indexes),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
@@ -554,26 +576,26 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
#else
|
||||
PANIC("Cuda error (PBS): TBC pbs is not supported.")
|
||||
#endif
|
||||
case CG:
|
||||
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
|
||||
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<const uint32_t *>(lwe_output_indexes),
|
||||
static_cast<const uint32_t *>(lut_vector),
|
||||
static_cast<const uint32_t *>(lut_vector_indexes),
|
||||
case PBS_VARIANT::CG:
|
||||
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t, uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint32_t *>(lwe_array_in),
|
||||
static_cast<const uint32_t *>(lwe_input_indexes),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case DEFAULT:
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
|
||||
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
|
||||
static_cast<const uint32_t *>(lwe_output_indexes),
|
||||
static_cast<const uint32_t *>(lut_vector),
|
||||
static_cast<const uint32_t *>(lut_vector_indexes),
|
||||
case PBS_VARIANT::DEFAULT:
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t, uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint32_t *>(lwe_array_in),
|
||||
static_cast<const uint32_t *>(lwe_input_indexes),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
|
||||
num_many_lut, lut_stride);
|
||||
@@ -644,7 +666,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
* - the constant memory (64K) is used for storing the roots of identity
|
||||
* values for the FFT
|
||||
*/
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
@@ -663,7 +685,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
switch (buffer->pbs_variant) {
|
||||
case PBS_VARIANT::TBC:
|
||||
#if (CUDA_ARCH >= 900)
|
||||
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t, uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
@@ -678,7 +700,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
PANIC("Cuda error (PBS): TBC pbs is not supported.")
|
||||
#endif
|
||||
case PBS_VARIANT::CG:
|
||||
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t, uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
@@ -690,7 +712,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
num_many_lut, lut_stride);
|
||||
break;
|
||||
case PBS_VARIANT::DEFAULT:
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t, uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
@@ -705,7 +727,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
PANIC("Cuda error (PBS): unknown pbs variant.")
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This cleanup function frees the data on GPU for the PBS buffer for 32 or 64
|
||||
* bits inputs.
|
||||
@@ -718,11 +739,16 @@ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
|
||||
*buffer = nullptr;
|
||||
}
|
||||
|
||||
template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
template bool has_support_to_cuda_programmable_bootstrap_cg<uint32_t, uint64_t>(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t max_shared_memory);
|
||||
|
||||
template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
|
||||
template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t, uint64_t>(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t max_shared_memory);
|
||||
|
||||
template void
|
||||
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t, uint64_t>(
|
||||
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
|
||||
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
|
||||
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
|
||||
@@ -732,7 +758,8 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride);
|
||||
|
||||
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
template void
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t, uint64_t>(
|
||||
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
|
||||
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
|
||||
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
|
||||
@@ -742,14 +769,14 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride);
|
||||
|
||||
template uint64_t scratch_cuda_programmable_bootstrap_cg<uint64_t>(
|
||||
template uint64_t scratch_cuda_programmable_bootstrap_cg<uint64_t, uint64_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
template uint64_t scratch_cuda_programmable_bootstrap<uint64_t>(
|
||||
template uint64_t scratch_cuda_programmable_bootstrap<uint64_t, uint64_t>(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, CLASSICAL> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
@@ -765,38 +792,31 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride);
|
||||
|
||||
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
|
||||
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
|
||||
uint32_t const *lwe_output_indexes, uint32_t const *lut_vector,
|
||||
uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
|
||||
uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key,
|
||||
pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride);
|
||||
|
||||
template uint64_t scratch_cuda_programmable_bootstrap_cg<uint32_t>(
|
||||
template uint64_t scratch_cuda_programmable_bootstrap_cg<uint32_t, uint64_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
|
||||
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
template uint64_t scratch_cuda_programmable_bootstrap<uint32_t>(
|
||||
template uint64_t scratch_cuda_programmable_bootstrap<uint32_t, uint32_t>(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<uint32_t, CLASSICAL> **buffer,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
|
||||
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
template bool
|
||||
has_support_to_cuda_programmable_bootstrap_tbc<uint32_t, uint64_t>(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
template bool
|
||||
has_support_to_cuda_programmable_bootstrap_tbc<uint64_t, uint64_t>(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
|
||||
#if CUDA_ARCH >= 900
|
||||
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
|
||||
template void
|
||||
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t, uint32_t>(
|
||||
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
|
||||
uint32_t const *lwe_output_indexes, uint32_t const *lut_vector,
|
||||
uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
|
||||
@@ -805,7 +825,8 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride);
|
||||
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
|
||||
template void
|
||||
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t, uint64_t>(
|
||||
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
|
||||
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
|
||||
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
|
||||
@@ -814,18 +835,35 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride);
|
||||
template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
|
||||
template void
|
||||
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t, uint64_t>(
|
||||
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
|
||||
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
|
||||
uint64_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
|
||||
uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key,
|
||||
pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride);
|
||||
template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint32_t, uint32_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint64_t, uint64_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint32_t, uint64_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
|
||||
PBS_MS_REDUCTION_T noise_reduction_type);
|
||||
|
||||
template bool
|
||||
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
|
||||
__uint128_t>(uint32_t polynomial_size, uint32_t max_shared_memory);
|
||||
|
||||
@@ -17,12 +17,13 @@
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD, bool first_iter>
|
||||
template <typename InputTorus, typename Torus, class params,
|
||||
sharedMemDegree SMD, bool first_iter>
|
||||
__global__ void __launch_bounds__(params::degree / params::opt)
|
||||
device_programmable_bootstrap_step_one(
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes, Torus *global_accumulator,
|
||||
double2 *global_join_buffer, uint32_t lwe_iteration,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
@@ -55,7 +56,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
const Torus *block_lwe_array_in =
|
||||
const InputTorus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
@@ -75,8 +76,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
// First iteration
|
||||
// Put "b" in [0, 2N[
|
||||
constexpr auto log_modulus = params::log2_degree + 1;
|
||||
Torus b_hat = 0;
|
||||
Torus correction = 0;
|
||||
InputTorus b_hat = 0;
|
||||
InputTorus correction = 0;
|
||||
if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) {
|
||||
correction = centered_binary_modulus_switch_body_correction_to_add(
|
||||
block_lwe_array_in, lwe_dimension, log_modulus);
|
||||
@@ -86,6 +87,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
|
||||
// The y-dimension is used to select the element of the GLWE this block will
|
||||
// compute
|
||||
// b_hat is cast from InputTorus (i.e. 64b or 32b) to uint32_t here
|
||||
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
@@ -100,7 +102,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
}
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
Torus a_hat = 0;
|
||||
InputTorus a_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
|
||||
params::log2_degree + 1); // 2 * params::log2_degree + 1);
|
||||
|
||||
@@ -307,7 +309,7 @@ uint64_t get_buffer_size_programmable_bootstrap(
|
||||
return buffer_size + buffer_size % sizeof(double2);
|
||||
}
|
||||
|
||||
template <typename Torus, typename params>
|
||||
template <typename InputTorus, typename Torus, typename params>
|
||||
__host__ uint64_t scratch_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t lwe_dimension,
|
||||
@@ -331,30 +333,38 @@ __host__ uint64_t scratch_programmable_bootstrap(
|
||||
// Configure step one
|
||||
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, true>,
|
||||
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
|
||||
PARTIALSM, true>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, true>,
|
||||
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
|
||||
PARTIALSM, true>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, false>,
|
||||
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
|
||||
PARTIALSM, false>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, false>,
|
||||
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
|
||||
PARTIALSM, false>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory >= partial_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_step_one<Torus, params, FULLSM, true>,
|
||||
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
|
||||
FULLSM, true>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_step_one<Torus, params, FULLSM, true>,
|
||||
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
|
||||
FULLSM, true>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_step_one<Torus, params, FULLSM, false>,
|
||||
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
|
||||
FULLSM, false>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_step_one<Torus, params, FULLSM, false>,
|
||||
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
|
||||
FULLSM, false>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -398,10 +408,10 @@ __host__ uint64_t scratch_programmable_bootstrap(
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus, class params, bool first_iter>
|
||||
template <typename InputTorus, typename Torus, class params, bool first_iter>
|
||||
__host__ void execute_step_one(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
|
||||
Torus *global_accumulator, double2 *global_join_buffer,
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
|
||||
@@ -416,21 +426,24 @@ __host__ void execute_step_one(
|
||||
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
|
||||
|
||||
if (max_shared_memory < partial_sm) {
|
||||
device_programmable_bootstrap_step_one<Torus, params, NOSM, first_iter>
|
||||
device_programmable_bootstrap_step_one<InputTorus, Torus, params, NOSM,
|
||||
first_iter>
|
||||
<<<grid, thds, 0, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
global_accumulator, global_join_buffer, lwe_iteration,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
|
||||
full_dm, noise_reduction_type);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, first_iter>
|
||||
device_programmable_bootstrap_step_one<InputTorus, Torus, params, PARTIALSM,
|
||||
first_iter>
|
||||
<<<grid, thds, partial_sm, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
global_accumulator, global_join_buffer, lwe_iteration,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
|
||||
partial_dm, noise_reduction_type);
|
||||
} else {
|
||||
device_programmable_bootstrap_step_one<Torus, params, FULLSM, first_iter>
|
||||
device_programmable_bootstrap_step_one<InputTorus, Torus, params, FULLSM,
|
||||
first_iter>
|
||||
<<<grid, thds, full_sm, stream>>>(
|
||||
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
|
||||
global_accumulator, global_join_buffer, lwe_iteration,
|
||||
@@ -484,11 +497,11 @@ __host__ void execute_step_two(
|
||||
/*
|
||||
* Host wrapper to the programmable bootstrap
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
__host__ void host_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
@@ -520,7 +533,7 @@ __host__ void host_programmable_bootstrap(
|
||||
|
||||
for (int i = 0; i < lwe_dimension; i++) {
|
||||
if (i == 0) {
|
||||
execute_step_one<Torus, params, true>(
|
||||
execute_step_one<InputTorus, Torus, params, true>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, global_accumulator,
|
||||
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
|
||||
@@ -528,7 +541,7 @@ __host__ void host_programmable_bootstrap(
|
||||
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one,
|
||||
noise_reduction_type);
|
||||
} else {
|
||||
execute_step_one<Torus, params, false>(
|
||||
execute_step_one<InputTorus, Torus, params, false>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, global_accumulator,
|
||||
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
|
||||
|
||||
@@ -7,15 +7,16 @@
|
||||
#include "programmable_bootstrap_tbc_multibit.cuh"
|
||||
#endif
|
||||
|
||||
// Used in CPP benchmarks, not used in rust code
|
||||
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t max_shared_memory) {
|
||||
return supports_cooperative_groups_on_multibit_programmable_bootstrap<
|
||||
uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples,
|
||||
max_shared_memory);
|
||||
uint64_t, uint64_t>(glwe_dimension, polynomial_size, level_count,
|
||||
num_samples, max_shared_memory);
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
@@ -26,39 +27,39 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 512:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 1024:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 2048:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 4096:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 8192:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
case 16384:
|
||||
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
|
||||
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
InputTorus, Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
|
||||
polynomial_size, level_count,
|
||||
max_shared_memory);
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
@@ -69,11 +70,11 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
@@ -82,7 +83,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
|
||||
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -90,7 +92,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 512:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -98,7 +101,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 1024:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -106,7 +110,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 2048:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -114,7 +119,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 4096:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -122,7 +128,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 8192:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -130,7 +137,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 16384:
|
||||
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -144,11 +152,11 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
@@ -157,7 +165,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
|
||||
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -165,7 +174,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 512:
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -173,7 +183,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 1024:
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -181,7 +192,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 2048:
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -189,7 +201,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 4096:
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -197,7 +210,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 8192:
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -205,7 +219,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 16384:
|
||||
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -239,7 +254,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
switch (buffer->pbs_variant) {
|
||||
case PBS_VARIANT::TBC:
|
||||
#if CUDA_ARCH >= 900
|
||||
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
|
||||
uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
@@ -254,7 +270,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
|
||||
#endif
|
||||
case PBS_VARIANT::CG:
|
||||
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
|
||||
uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
@@ -266,7 +283,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case PBS_VARIANT::DEFAULT:
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
|
||||
uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
@@ -282,7 +300,73 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
|
||||
void *stream, uint32_t gpu_index, void *lwe_array_out,
|
||||
void const *lwe_output_indexes, void const *lut_vector,
|
||||
void const *lut_vector_indexes, void const *lwe_array_in,
|
||||
void const *lwe_input_indexes, void const *bootstrapping_key,
|
||||
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
|
||||
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
|
||||
uint32_t lut_stride) {
|
||||
|
||||
PANIC_IF_FALSE(base_log <= 64,
|
||||
"Cuda error (multi-bit PBS): base log (%d) should be <= 64",
|
||||
base_log);
|
||||
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
|
||||
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
|
||||
|
||||
switch (buffer->pbs_variant) {
|
||||
case PBS_VARIANT::TBC:
|
||||
#if CUDA_ARCH >= 900
|
||||
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint32_t,
|
||||
uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint32_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
#else
|
||||
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
|
||||
#endif
|
||||
case PBS_VARIANT::CG:
|
||||
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint32_t,
|
||||
uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint32_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case PBS_VARIANT::DEFAULT:
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint32_t,
|
||||
uint64_t>(
|
||||
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
|
||||
static_cast<const uint64_t *>(lwe_output_indexes),
|
||||
static_cast<const uint64_t *>(lut_vector),
|
||||
static_cast<const uint64_t *>(lut_vector_indexes),
|
||||
static_cast<const uint32_t *>(lwe_array_in),
|
||||
static_cast<const uint64_t *>(lwe_input_indexes),
|
||||
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
|
||||
}
|
||||
}
|
||||
|
||||
template <typename InputTorus, typename Torus>
|
||||
uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
@@ -290,43 +374,43 @@ uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 512:
|
||||
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 1024:
|
||||
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 2048:
|
||||
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 4096:
|
||||
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 8192:
|
||||
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 16384:
|
||||
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
@@ -338,7 +422,7 @@ uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
uint64_t scratch_cuda_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
@@ -346,43 +430,43 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return scratch_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 512:
|
||||
return scratch_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 1024:
|
||||
return scratch_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 2048:
|
||||
return scratch_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 4096:
|
||||
return scratch_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 8192:
|
||||
return scratch_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 16384:
|
||||
return scratch_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
@@ -400,7 +484,8 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
bool supports_cg =
|
||||
supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t>(
|
||||
supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t,
|
||||
uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
|
||||
#if (CUDA_ARCH >= 900)
|
||||
@@ -412,24 +497,70 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
|
||||
|
||||
bool supports_tbc =
|
||||
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t,
|
||||
uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
level_count, cuda_get_max_shared_memory(gpu_index));
|
||||
|
||||
if (supports_tbc)
|
||||
return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t,
|
||||
uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
else
|
||||
#endif
|
||||
if (supports_cg)
|
||||
return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
else
|
||||
return scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
return scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
}
|
||||
|
||||
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_32_64(
|
||||
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
|
||||
|
||||
bool supports_cg =
|
||||
supports_cooperative_groups_on_multibit_programmable_bootstrap<uint32_t,
|
||||
uint64_t>(
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
|
||||
#if (CUDA_ARCH >= 900)
|
||||
// On H100s we should be using TBC until num_samples < num_sms / 2.
|
||||
// After that we switch to CG until not supported anymore.
|
||||
// At this point we return to TBC.
|
||||
int num_sms = 0;
|
||||
check_cuda_error(cudaDeviceGetAttribute(
|
||||
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
|
||||
|
||||
bool supports_tbc =
|
||||
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint32_t,
|
||||
uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
level_count, cuda_get_max_shared_memory(gpu_index));
|
||||
|
||||
if (supports_tbc)
|
||||
return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint32_t,
|
||||
uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
else
|
||||
#endif
|
||||
if (supports_cg)
|
||||
return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint32_t, uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
else
|
||||
return scratch_cuda_multi_bit_programmable_bootstrap<uint32_t, uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
|
||||
glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count, allocate_gpu_memory);
|
||||
@@ -455,7 +586,7 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
|
||||
* The value 13 was empirically determined based on memory requirements for
|
||||
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
uint32_t polynomial_size, uint32_t glwe_dimension,
|
||||
uint32_t level_count, uint64_t full_sm_keybundle) {
|
||||
@@ -466,13 +597,14 @@ uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
if (max_shared_memory < full_sm_keybundle)
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_blocks_per_sm,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
|
||||
params, NOSM>,
|
||||
polynomial_size / params::opt, full_sm_keybundle);
|
||||
else
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_blocks_per_sm,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
polynomial_size / params::opt, 0);
|
||||
|
||||
int num_sms = 0;
|
||||
@@ -535,14 +667,15 @@ uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
|
||||
return (max_num_chunks > divisor) ? divisor : max_num_chunks;
|
||||
}
|
||||
|
||||
template uint64_t scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
template uint64_t
|
||||
scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template void
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t, uint64_t>(
|
||||
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
|
||||
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
|
||||
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
|
||||
@@ -552,14 +685,16 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride);
|
||||
|
||||
template uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
template uint64_t
|
||||
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
|
||||
void *stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template void
|
||||
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
|
||||
uint64_t>(
|
||||
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
|
||||
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
|
||||
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
|
||||
@@ -570,12 +705,12 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
uint32_t num_many_lut, uint32_t lut_stride);
|
||||
|
||||
template bool
|
||||
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint32_t, uint64_t>(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory);
|
||||
|
||||
#if (CUDA_ARCH >= 900)
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
@@ -583,43 +718,43 @@ uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 512:
|
||||
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 1024:
|
||||
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 2048:
|
||||
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 4096:
|
||||
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 8192:
|
||||
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
allocate_gpu_memory);
|
||||
case 16384:
|
||||
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
|
||||
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
|
||||
polynomial_size, level_count, input_lwe_ciphertext_count,
|
||||
@@ -630,11 +765,11 @@ uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap(
|
||||
" in the interval [256..16384].")
|
||||
}
|
||||
}
|
||||
template <typename Torus>
|
||||
template <typename InputTorus, typename Torus>
|
||||
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
@@ -646,7 +781,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 256:
|
||||
host_tbc_multi_bit_programmable_bootstrap<uint64_t, AmortizedDegree<256>>(
|
||||
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -654,7 +790,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 512:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
|
||||
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -662,7 +799,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 1024:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
|
||||
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -675,14 +813,16 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
|
||||
|
||||
if (4 * num_sms < num_samples * level_count * (glwe_dimension + 1))
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
|
||||
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
lwe_dimension, polynomial_size, grouping_factor, base_log,
|
||||
level_count, num_samples, num_many_lut, lut_stride);
|
||||
else
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, Degree<2048>>(
|
||||
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -692,7 +832,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
break;
|
||||
}
|
||||
case 4096:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
|
||||
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -700,7 +841,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 8192:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
|
||||
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<8192>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -708,7 +850,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
num_samples, num_many_lut, lut_stride);
|
||||
break;
|
||||
case 16384:
|
||||
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
|
||||
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
|
||||
AmortizedDegree<16384>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
|
||||
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
|
||||
@@ -722,13 +865,15 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
|
||||
}
|
||||
}
|
||||
|
||||
template uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
template uint64_t
|
||||
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
|
||||
void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
|
||||
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
|
||||
|
||||
template void
|
||||
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
|
||||
uint64_t>(
|
||||
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
|
||||
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
|
||||
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
|
||||
@@ -737,4 +882,16 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride);
|
||||
|
||||
template void
|
||||
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint32_t,
|
||||
uint64_t>(
|
||||
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
|
||||
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
|
||||
uint64_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
|
||||
uint64_t const *lwe_input_indexes, uint64_t const *bootstrapping_key,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
|
||||
uint32_t num_many_lut, uint32_t lut_stride);
|
||||
#endif
|
||||
|
||||
@@ -25,9 +25,10 @@ get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
|
||||
level_count;
|
||||
}
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
template <typename InputTorus, typename Torus, class params,
|
||||
sharedMemDegree SMD>
|
||||
__global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
|
||||
const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
|
||||
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
@@ -55,7 +56,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
|
||||
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
|
||||
|
||||
const Torus *block_lwe_array_in =
|
||||
const InputTorus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
|
||||
|
||||
double2 *keybundle = keybundle_array +
|
||||
@@ -86,10 +87,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
// Precalculate the monomial degrees and store them in shared memory
|
||||
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
|
||||
if (threadIdx.x < (1 << grouping_factor)) {
|
||||
const Torus *lwe_array_group =
|
||||
const InputTorus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
monomial_degrees[threadIdx.x] =
|
||||
calculates_monomial_degree<InputTorus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -145,9 +147,10 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
|
||||
// Then we can just calculate the offset needed to apply these coefficients, and
|
||||
// the operation transforms into a pointwise vector multiplication, avoiding the need to
|
||||
// perform extra instructions other than MADD
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
template <typename InputTorus, typename Torus, class params,
|
||||
sharedMemDegree SMD>
|
||||
__global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
|
||||
const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
|
||||
uint32_t lwe_offset, uint64_t lwe_chunk_size,
|
||||
@@ -188,7 +191,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
|
||||
|
||||
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
|
||||
|
||||
const Torus *block_lwe_array_in =
|
||||
const InputTorus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
|
||||
|
||||
double2 *keybundle = keybundle_array +
|
||||
@@ -219,10 +222,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
|
||||
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
|
||||
|
||||
if (threadIdx.x < (1 << grouping_factor)) {
|
||||
const Torus *lwe_array_group =
|
||||
const InputTorus *lwe_array_group =
|
||||
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
|
||||
monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
monomial_degrees[threadIdx.x] =
|
||||
calculates_monomial_degree<InputTorus, params>(
|
||||
lwe_array_group, threadIdx.x, grouping_factor);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
@@ -269,10 +273,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD, bool is_first_iter>
|
||||
template <typename InputTorus, typename Torus, class params,
|
||||
sharedMemDegree SMD, bool is_first_iter>
|
||||
__global__ void __launch_bounds__(params::degree / params::opt)
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one(
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes, Torus *global_accumulator,
|
||||
@@ -305,7 +310,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
if constexpr (SMD == PARTIALSM)
|
||||
accumulator_fft = (double2 *)sharedmem;
|
||||
|
||||
const Torus *block_lwe_array_in =
|
||||
const InputTorus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
@@ -327,7 +332,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
////////////////////////////////////////////////////////////
|
||||
// Initializes the accumulator with the body of LWE
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
InputTorus b_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
@@ -501,7 +506,7 @@ uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
|
||||
return sizeof(Torus) * polynomial_size; // accumulator
|
||||
}
|
||||
|
||||
template <typename Torus, typename params>
|
||||
template <typename InputTorus, typename Torus, typename params>
|
||||
__host__ uint64_t scratch_multi_bit_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
|
||||
@@ -526,20 +531,22 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
|
||||
|
||||
if (max_shared_memory < full_sm_keybundle) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
|
||||
params, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
|
||||
params, NOSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -547,59 +554,59 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
|
||||
if (max_shared_memory < partial_sm_accumulate_step_one) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<
|
||||
Torus, params, NOSM, false>,
|
||||
InputTorus, Torus, params, NOSM, false>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<
|
||||
Torus, params, NOSM, false>,
|
||||
InputTorus, Torus, params, NOSM, false>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<
|
||||
Torus, params, NOSM, true>,
|
||||
InputTorus, Torus, params, NOSM, true>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<
|
||||
Torus, params, NOSM, true>,
|
||||
InputTorus, Torus, params, NOSM, true>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory < full_sm_accumulate_step_one) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<
|
||||
Torus, params, PARTIALSM, false>,
|
||||
InputTorus, Torus, params, PARTIALSM, false>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
partial_sm_accumulate_step_one));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<
|
||||
Torus, params, PARTIALSM, false>,
|
||||
InputTorus, Torus, params, PARTIALSM, false>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<
|
||||
Torus, params, PARTIALSM, true>,
|
||||
InputTorus, Torus, params, PARTIALSM, true>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
partial_sm_accumulate_step_one));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<
|
||||
Torus, params, PARTIALSM, true>,
|
||||
InputTorus, Torus, params, PARTIALSM, true>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<
|
||||
Torus, params, FULLSM, false>,
|
||||
InputTorus, Torus, params, FULLSM, false>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
full_sm_accumulate_step_one));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<
|
||||
Torus, params, FULLSM, false>,
|
||||
InputTorus, Torus, params, FULLSM, false>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<
|
||||
Torus, params, FULLSM, true>,
|
||||
InputTorus, Torus, params, FULLSM, true>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
full_sm_accumulate_step_one));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<
|
||||
Torus, params, FULLSM, true>,
|
||||
InputTorus, Torus, params, FULLSM, true>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -644,7 +651,7 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<InputTorus, Torus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
|
||||
level_count, full_sm_keybundle);
|
||||
uint64_t size_tracker = 0;
|
||||
@@ -655,9 +662,9 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
__host__ void execute_compute_keybundle(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
|
||||
cudaStream_t stream, uint32_t gpu_index, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
|
||||
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
@@ -686,7 +693,8 @@ __host__ void execute_compute_keybundle(
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
|
||||
if (max_shared_memory < full_sm_keybundle) {
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus, params,
|
||||
NOSM>
|
||||
<<<grid_keybundle, thds, 0, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
|
||||
@@ -694,7 +702,8 @@ __host__ void execute_compute_keybundle(
|
||||
d_mem, full_sm_keybundle);
|
||||
} else {
|
||||
bool supports_tbc =
|
||||
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<InputTorus,
|
||||
Torus>(
|
||||
num_samples, glwe_dimension, polynomial_size, level_count,
|
||||
cuda_get_max_shared_memory(gpu_index));
|
||||
|
||||
@@ -703,20 +712,22 @@ __host__ void execute_compute_keybundle(
|
||||
dim3 thds_new_keybundle(512, 1, 1);
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM>,
|
||||
InputTorus, Torus, Degree<2048>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 3 * full_sm_keybundle));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM>,
|
||||
InputTorus, Torus, Degree<2048>, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
|
||||
Torus, Degree<2048>, FULLSM><<<grid_keybundle, thds_new_keybundle,
|
||||
3 * full_sm_keybundle, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
|
||||
lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input);
|
||||
InputTorus, Torus, Degree<2048>, FULLSM>
|
||||
<<<grid_keybundle, thds_new_keybundle, 3 * full_sm_keybundle,
|
||||
stream>>>(lwe_array_in, lwe_input_indexes, keybundle_fft,
|
||||
bootstrapping_key, lwe_dimension, lwe_offset, chunk_size,
|
||||
keybundle_size_per_input);
|
||||
} else {
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
|
||||
params, FULLSM>
|
||||
<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
|
||||
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
|
||||
@@ -727,10 +738,10 @@ __host__ void execute_compute_keybundle(
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename Torus, class params, bool is_first_iter>
|
||||
template <typename InputTorus, typename Torus, class params, bool is_first_iter>
|
||||
__host__ void execute_step_one(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
|
||||
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count) {
|
||||
@@ -754,7 +765,7 @@ __host__ void execute_step_one(
|
||||
|
||||
if (max_shared_memory < partial_sm_accumulate_step_one)
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<
|
||||
Torus, params, NOSM, is_first_iter>
|
||||
InputTorus, Torus, params, NOSM, is_first_iter>
|
||||
<<<grid_accumulate_step_one, thds, 0, stream>>>(
|
||||
lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes,
|
||||
global_accumulator, global_accumulator_fft, lwe_dimension,
|
||||
@@ -762,7 +773,7 @@ __host__ void execute_step_one(
|
||||
full_sm_accumulate_step_one);
|
||||
else if (max_shared_memory < full_sm_accumulate_step_one)
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<
|
||||
Torus, params, PARTIALSM, is_first_iter>
|
||||
InputTorus, Torus, params, PARTIALSM, is_first_iter>
|
||||
<<<grid_accumulate_step_one, thds, partial_sm_accumulate_step_one,
|
||||
stream>>>(lwe_array_in, lwe_input_indexes, lut_vector,
|
||||
lut_vector_indexes, global_accumulator,
|
||||
@@ -771,7 +782,7 @@ __host__ void execute_step_one(
|
||||
partial_sm_accumulate_step_one);
|
||||
else
|
||||
device_multi_bit_programmable_bootstrap_accumulate_step_one<
|
||||
Torus, params, FULLSM, is_first_iter>
|
||||
InputTorus, Torus, params, FULLSM, is_first_iter>
|
||||
<<<grid_accumulate_step_one, thds, full_sm_accumulate_step_one,
|
||||
stream>>>(lwe_array_in, lwe_input_indexes, lut_vector,
|
||||
lut_vector_indexes, global_accumulator,
|
||||
@@ -823,11 +834,11 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
__host__ void host_multi_bit_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
@@ -840,7 +851,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
// Compute a keybundle
|
||||
execute_compute_keybundle<Torus, params>(
|
||||
execute_compute_keybundle<InputTorus, Torus, params>(
|
||||
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, level_count, lwe_offset);
|
||||
@@ -853,12 +864,12 @@ __host__ void host_multi_bit_programmable_bootstrap(
|
||||
bool is_last_iter =
|
||||
(j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
|
||||
if (is_first_iter) {
|
||||
execute_step_one<Torus, params, true>(
|
||||
execute_step_one<InputTorus, Torus, params, true>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, buffer, num_samples, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count);
|
||||
} else {
|
||||
execute_step_one<Torus, params, false>(
|
||||
execute_step_one<InputTorus, Torus, params, false>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, buffer, num_samples, lwe_dimension,
|
||||
glwe_dimension, polynomial_size, base_log, level_count);
|
||||
|
||||
@@ -98,9 +98,9 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
|
||||
|
||||
bool supports_cg =
|
||||
supports_cooperative_groups_on_multibit_programmable_bootstrap<
|
||||
__uint128_t>(glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count,
|
||||
cuda_get_max_shared_memory(gpu_index));
|
||||
uint64_t, __uint128_t>(glwe_dimension, polynomial_size, level_count,
|
||||
input_lwe_ciphertext_count,
|
||||
cuda_get_max_shared_memory(gpu_index));
|
||||
|
||||
if (supports_cg)
|
||||
return scratch_cuda_cg_multi_bit_programmable_bootstrap_128<uint64_t>(
|
||||
|
||||
@@ -35,12 +35,13 @@ namespace cg = cooperative_groups;
|
||||
*
|
||||
* Each y-block computes one element of the lwe_array_out.
|
||||
*/
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
template <typename InputTorus, typename Torus, class params,
|
||||
sharedMemDegree SMD>
|
||||
__global__ void device_programmable_bootstrap_tbc(
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
@@ -83,7 +84,7 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
|
||||
// The x index of the block (blockIdx.x) is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
const Torus *block_lwe_array_in =
|
||||
const InputTorus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
@@ -99,8 +100,8 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
|
||||
// Put "b" in [0, 2N[
|
||||
constexpr auto log_modulus = params::log2_degree + 1;
|
||||
Torus b_hat = 0;
|
||||
Torus correction = 0;
|
||||
InputTorus b_hat = 0;
|
||||
InputTorus correction = 0;
|
||||
if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) {
|
||||
correction = centered_binary_modulus_switch_body_correction_to_add(
|
||||
block_lwe_array_in, lwe_dimension, log_modulus);
|
||||
@@ -117,7 +118,7 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
__syncthreads();
|
||||
|
||||
// Put "a" in [0, 2N[
|
||||
Torus a_hat = 0;
|
||||
InputTorus a_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[i], a_hat, log_modulus);
|
||||
|
||||
// Perform ACC * (X^a_hat - 1)
|
||||
@@ -200,12 +201,13 @@ __global__ void device_programmable_bootstrap_tbc(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
template <typename InputTorus, typename Torus, class params,
|
||||
sharedMemDegree SMD>
|
||||
__global__ void device_programmable_bootstrap_tbc_2_2_params(
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
|
||||
uint32_t lwe_dimension, uint32_t num_many_lut, uint32_t lut_stride,
|
||||
@@ -247,7 +249,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
|
||||
}
|
||||
// The first dimension of the grid (blockIdx.x) is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
const Torus *block_lwe_array_in =
|
||||
const InputTorus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
@@ -263,8 +265,8 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
|
||||
|
||||
// Put "b" in [0, 2N[
|
||||
constexpr auto log_modulus = params::log2_degree + 1;
|
||||
Torus b_hat = 0;
|
||||
Torus correction = 0;
|
||||
InputTorus b_hat = 0;
|
||||
InputTorus correction = 0;
|
||||
if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) {
|
||||
correction = centered_binary_modulus_switch_body_correction_to_add(
|
||||
block_lwe_array_in, lwe_dimension, log_modulus);
|
||||
@@ -276,7 +278,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
|
||||
params::degree / params::opt>(
|
||||
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
|
||||
false);
|
||||
Torus temp_a_hat = 0;
|
||||
InputTorus temp_a_hat = 0;
|
||||
for (int i = 0; i < lwe_dimension; i++) {
|
||||
|
||||
// We calculate the modulus switch of a warp size of elements
|
||||
@@ -382,7 +384,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
|
||||
cluster.sync();
|
||||
}
|
||||
|
||||
template <typename Torus, typename params>
|
||||
template <typename InputTorus, typename Torus, typename params>
|
||||
__host__ uint64_t scratch_programmable_bootstrap_tbc(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t lwe_dimension,
|
||||
@@ -410,27 +412,27 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc(
|
||||
|
||||
if (max_shared_memory >= full_sm + minimum_sm_tbc) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm + minimum_sm_tbc));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory >= partial_sm + minimum_sm_tbc) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
partial_sm + minimum_sm_tbc));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, minimum_sm_tbc));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -446,11 +448,11 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc(
|
||||
/*
|
||||
* Host wrapper
|
||||
*/
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
__host__ void host_programmable_bootstrap_tbc(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
|
||||
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
|
||||
@@ -506,7 +508,8 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
config.dynamicSmemBytes = minimum_sm_tbc;
|
||||
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config, device_programmable_bootstrap_tbc<Torus, params, NOSM>,
|
||||
&config,
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm,
|
||||
@@ -515,7 +518,8 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
config.dynamicSmemBytes = partial_sm + minimum_sm_tbc;
|
||||
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config, device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
|
||||
&config,
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
|
||||
@@ -530,18 +534,22 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
config.dynamicSmemBytes = full_sm_2_2;
|
||||
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
|
||||
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_2_2));
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
|
||||
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
cudaFuncAttributePreferredSharedMemoryCarveout,
|
||||
cudaSharedmemCarveoutMaxShared));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
|
||||
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config,
|
||||
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
|
||||
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, num_many_lut, lut_stride, noise_reduction_type));
|
||||
@@ -549,7 +557,8 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
config.dynamicSmemBytes = full_sm + minimum_sm_tbc;
|
||||
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config, device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
|
||||
&config,
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
|
||||
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0,
|
||||
@@ -560,7 +569,7 @@ __host__ void host_programmable_bootstrap_tbc(
|
||||
}
|
||||
|
||||
// Verify if the grid size satisfies the cooperative group constraints
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
__host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
|
||||
int glwe_dimension, int level_count, int num_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
@@ -585,18 +594,21 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
|
||||
if (max_shared_memory < partial_sm) {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_programmable_bootstrap_tbc<Torus, params, NOSM>, thds,
|
||||
0);
|
||||
(void *)
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
|
||||
thds, 0);
|
||||
} else if (max_shared_memory < full_sm) {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
|
||||
(void *)device_programmable_bootstrap_tbc<InputTorus, Torus, params,
|
||||
PARTIALSM>,
|
||||
thds, partial_sm);
|
||||
} else {
|
||||
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&max_active_blocks_per_sm,
|
||||
(void *)device_programmable_bootstrap_tbc<Torus, params, FULLSM>, thds,
|
||||
full_sm);
|
||||
(void *)device_programmable_bootstrap_tbc<InputTorus, Torus, params,
|
||||
FULLSM>,
|
||||
thds, full_sm);
|
||||
}
|
||||
|
||||
// Get the number of streaming multiprocessors
|
||||
@@ -621,7 +633,7 @@ bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
__host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
@@ -661,34 +673,39 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
|
||||
* disable cudaFuncAttributeNonPortableClusterSizeAllowed */
|
||||
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size, device_programmable_bootstrap_tbc<Torus, params, NOSM>,
|
||||
&config));
|
||||
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size,
|
||||
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>, &config));
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
|
||||
&config));
|
||||
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size,
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>,
|
||||
&config));
|
||||
} else {
|
||||
if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
|
||||
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size,
|
||||
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
|
||||
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
&config));
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size,
|
||||
device_programmable_bootstrap_tbc<Torus, params, FULLSM>, &config));
|
||||
device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>,
|
||||
&config));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -18,13 +18,14 @@
|
||||
#include "types/complex/operations.cuh"
|
||||
#include <vector>
|
||||
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
template <typename InputTorus, typename Torus, class params,
|
||||
sharedMemDegree SMD>
|
||||
__global__ void __launch_bounds__(params::degree / params::opt)
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate(
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double2 *__restrict__ keybundle_array, double2 *join_buffer,
|
||||
Torus *global_accumulator, uint32_t lwe_dimension,
|
||||
@@ -67,7 +68,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
|
||||
// The first dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
const Torus *block_lwe_array_in =
|
||||
const InputTorus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
@@ -87,7 +88,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
|
||||
if (lwe_offset == 0) {
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
InputTorus b_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
@@ -197,13 +198,14 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
//- Use a register based fft that uses the minimal synchronizations
|
||||
//- Register based fourier domain multiplication. Transfer fft's between blocks
|
||||
// instead of accumulator.
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
template <typename InputTorus, typename Torus, class params,
|
||||
sharedMemDegree SMD>
|
||||
__global__ void __launch_bounds__(params::degree / params::opt)
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params(
|
||||
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
|
||||
const Torus *__restrict__ lut_vector,
|
||||
const Torus *__restrict__ lut_vector_indexes,
|
||||
const Torus *__restrict__ lwe_array_in,
|
||||
const InputTorus *__restrict__ lwe_array_in,
|
||||
const Torus *__restrict__ lwe_input_indexes,
|
||||
const double2 *__restrict__ keybundle_array, double2 *join_buffer,
|
||||
Torus *global_accumulator, uint32_t lwe_dimension, uint32_t lwe_offset,
|
||||
@@ -247,7 +249,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
|
||||
// The first dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
const Torus *block_lwe_array_in =
|
||||
const InputTorus *block_lwe_array_in =
|
||||
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
|
||||
|
||||
const Torus *block_lut_vector =
|
||||
@@ -266,7 +268,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
|
||||
Torus reg_acc_rotated[params::opt];
|
||||
if (lwe_offset == 0) {
|
||||
// Put "b" in [0, 2N[
|
||||
Torus b_hat = 0;
|
||||
InputTorus b_hat = 0;
|
||||
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
|
||||
params::log2_degree + 1);
|
||||
|
||||
@@ -400,7 +402,7 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
|
||||
return sizeof(Torus) * polynomial_size * 2; // accumulator
|
||||
}
|
||||
|
||||
template <typename Torus, typename params>
|
||||
template <typename InputTorus, typename Torus, typename params>
|
||||
__host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index,
|
||||
pbs_buffer<uint64_t, MULTI_BIT> **buffer, uint32_t glwe_dimension,
|
||||
@@ -430,20 +432,22 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
|
||||
|
||||
if (max_shared_memory < full_sm_keybundle) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
|
||||
params, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
|
||||
params, NOSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
|
||||
params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
@@ -451,58 +455,58 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
|
||||
if (max_shared_memory <
|
||||
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
NOSM>,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<
|
||||
InputTorus, Torus, params, NOSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
minimum_sm_tbc_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
NOSM>,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<
|
||||
InputTorus, Torus, params, NOSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else if (max_shared_memory <
|
||||
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
PARTIALSM>,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<
|
||||
InputTorus, Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
PARTIALSM>,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<
|
||||
InputTorus, Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
check_cuda_error(cudaGetLastError());
|
||||
} else {
|
||||
if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
|
||||
Torus, params, FULLSM>,
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
full_sm_tbc_accumulate + 2 * minimum_sm_tbc_accumulate));
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
|
||||
Torus, params, FULLSM>,
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncAttributePreferredSharedMemoryCarveout,
|
||||
cudaSharedmemCarveoutMaxShared));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
|
||||
Torus, params, FULLSM>,
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
}
|
||||
check_cuda_error(cudaGetLastError());
|
||||
}
|
||||
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
|
||||
auto lwe_chunk_size = get_lwe_chunk_size<InputTorus, Torus, params>(
|
||||
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
|
||||
level_count, full_sm_keybundle);
|
||||
uint64_t size_tracker = 0;
|
||||
@@ -513,10 +517,10 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
|
||||
return size_tracker;
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
__host__ void execute_tbc_external_product_loop(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
|
||||
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
|
||||
@@ -579,8 +583,8 @@ __host__ void execute_tbc_external_product_loop(
|
||||
config.dynamicSmemBytes = minimum_dm;
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
NOSM>,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<
|
||||
InputTorus, Torus, params, NOSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
|
||||
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
@@ -591,8 +595,8 @@ __host__ void execute_tbc_external_product_loop(
|
||||
config.dynamicSmemBytes = partial_dm + minimum_dm;
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
PARTIALSM>,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<
|
||||
InputTorus, Torus, params, PARTIALSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
|
||||
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
@@ -607,22 +611,22 @@ __host__ void execute_tbc_external_product_loop(
|
||||
config.dynamicSmemBytes = full_dm + 2 * minimum_dm;
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
|
||||
Torus, params, FULLSM>,
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
full_dm + 2 * minimum_dm));
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
|
||||
Torus, params, FULLSM>,
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncAttributePreferredSharedMemoryCarveout,
|
||||
cudaSharedmemCarveoutMaxShared));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
|
||||
Torus, params, FULLSM>,
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
|
||||
Torus, params, FULLSM>,
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
|
||||
global_accumulator, lwe_dimension, lwe_offset, chunk_size,
|
||||
@@ -630,8 +634,8 @@ __host__ void execute_tbc_external_product_loop(
|
||||
} else {
|
||||
check_cuda_error(cudaLaunchKernelEx(
|
||||
&config,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
|
||||
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
|
||||
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
@@ -642,11 +646,11 @@ __host__ void execute_tbc_external_product_loop(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
__host__ void host_tbc_multi_bit_programmable_bootstrap(
|
||||
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
|
||||
Torus const *lwe_output_indexes, Torus const *lut_vector,
|
||||
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
|
||||
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
|
||||
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
|
||||
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
|
||||
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
|
||||
@@ -659,13 +663,13 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
|
||||
lwe_offset += lwe_chunk_size) {
|
||||
|
||||
// Compute a keybundle
|
||||
execute_compute_keybundle<Torus, params>(
|
||||
execute_compute_keybundle<InputTorus, Torus, params>(
|
||||
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
|
||||
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
grouping_factor, level_count, lwe_offset);
|
||||
|
||||
// Accumulate
|
||||
execute_tbc_external_product_loop<Torus, params>(
|
||||
execute_tbc_external_product_loop<InputTorus, Torus, params>(
|
||||
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
|
||||
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
|
||||
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
|
||||
@@ -690,7 +694,7 @@ bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
template <typename InputTorus, typename Torus, class params>
|
||||
__host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
|
||||
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t level_count, uint32_t max_shared_memory) {
|
||||
@@ -732,45 +736,45 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
|
||||
if (max_shared_memory <
|
||||
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
NOSM>,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<
|
||||
InputTorus, Torus, params, NOSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
NOSM>,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<
|
||||
InputTorus, Torus, params, NOSM>,
|
||||
&config));
|
||||
} else if (max_shared_memory <
|
||||
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
PARTIALSM>,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<
|
||||
InputTorus, Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
PARTIALSM>,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<
|
||||
InputTorus, Torus, params, PARTIALSM>,
|
||||
&config));
|
||||
} else {
|
||||
if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
|
||||
Torus, params, FULLSM>,
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
|
||||
Torus, params, FULLSM>,
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
&config));
|
||||
} else {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
|
||||
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
|
||||
&cluster_size,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
|
||||
FULLSM>,
|
||||
device_multi_bit_programmable_bootstrap_tbc_accumulate<
|
||||
InputTorus, Torus, params, FULLSM>,
|
||||
&config));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -166,14 +166,15 @@ public:
|
||||
#if CUDA_ARCH >= 900
|
||||
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
|
||||
(benchmark::State &st) {
|
||||
if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
|
||||
if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t,
|
||||
uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
pbs_level, cuda_get_max_shared_memory(0))) {
|
||||
st.SkipWithError("Configuration not supported for tbc operation");
|
||||
return;
|
||||
}
|
||||
|
||||
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
|
||||
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
|
||||
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
|
||||
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
|
||||
true);
|
||||
@@ -181,7 +182,8 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
|
||||
uint32_t lut_stride = 0;
|
||||
for (auto _ : st) {
|
||||
// Execute PBS
|
||||
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
|
||||
uint64_t>(
|
||||
stream, gpu_index, d_lwe_ct_out_array, d_lwe_output_indexes,
|
||||
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
|
||||
d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
|
||||
@@ -197,7 +199,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
|
||||
|
||||
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
|
||||
(benchmark::State &st) {
|
||||
if (!has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
if (!has_support_to_cuda_programmable_bootstrap_cg_multi_bit_64(
|
||||
glwe_dimension, polynomial_size, pbs_level,
|
||||
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index))) {
|
||||
st.SkipWithError("Configuration not supported for fast operation");
|
||||
@@ -254,7 +256,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
|
||||
#if CUDA_ARCH >= 900
|
||||
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
|
||||
(benchmark::State &st) {
|
||||
if (!has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
|
||||
if (!has_support_to_cuda_programmable_bootstrap_tbc<uint64_t, uint64_t>(
|
||||
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
|
||||
pbs_level, cuda_get_max_shared_memory(0))) {
|
||||
st.SkipWithError("Configuration not supported for tbc operation");
|
||||
@@ -269,7 +271,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
|
||||
uint32_t lut_stride = 0;
|
||||
for (auto _ : st) {
|
||||
// Execute PBS
|
||||
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t, uint64_t>(
|
||||
stream, gpu_index, (uint64_t *)d_lwe_ct_out_array,
|
||||
(uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
|
||||
(uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,
|
||||
@@ -301,7 +303,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
|
||||
uint32_t lut_stride = 0;
|
||||
for (auto _ : st) {
|
||||
// Execute PBS
|
||||
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
|
||||
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t, uint64_t>(
|
||||
stream, gpu_index, (uint64_t *)d_lwe_ct_out_array,
|
||||
(uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
|
||||
(uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,
|
||||
|
||||
@@ -187,7 +187,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
|
||||
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
|
||||
(lwe_dimension + 1));
|
||||
// Execute PBS
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
|
||||
stream, gpu_index, (void *)d_lwe_ct_out_array,
|
||||
(void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
|
||||
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
|
||||
|
||||
@@ -3020,6 +3020,20 @@ unsafe extern "C" {
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_programmable_bootstrap_32_64(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
buffer: *mut *mut i8,
|
||||
lwe_dimension: u32,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
level_count: u32,
|
||||
input_lwe_ciphertext_count: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
noise_reduction_type: PBS_MS_REDUCTION_T,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_programmable_bootstrap_128(
|
||||
stream: *mut ffi::c_void,
|
||||
@@ -3035,7 +3049,7 @@ unsafe extern "C" {
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
|
||||
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
lwe_array_out: *mut ffi::c_void,
|
||||
@@ -3057,7 +3071,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
lwe_array_out: *mut ffi::c_void,
|
||||
@@ -3110,6 +3124,7 @@ unsafe extern "C" {
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
#[link_name = "\u{1}_Z55has_support_to_cuda_programmable_bootstrap_cg_multi_bitjjjjj"]
|
||||
pub fn has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
@@ -3144,6 +3159,18 @@ unsafe extern "C" {
|
||||
grouping_factor: u32,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_multi_bit_programmable_bootstrap_32_64(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
pbs_buffer: *mut *mut i8,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
level_count: u32,
|
||||
input_lwe_ciphertext_count: u32,
|
||||
allocate_gpu_memory: bool,
|
||||
) -> u64;
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
stream: *mut ffi::c_void,
|
||||
@@ -3179,6 +3206,29 @@ unsafe extern "C" {
|
||||
lut_stride: u32,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
lwe_array_out: *mut ffi::c_void,
|
||||
lwe_output_indexes: *const ffi::c_void,
|
||||
lut_vector: *const ffi::c_void,
|
||||
lut_vector_indexes: *const ffi::c_void,
|
||||
lwe_array_in: *const ffi::c_void,
|
||||
lwe_input_indexes: *const ffi::c_void,
|
||||
bootstrapping_key: *const ffi::c_void,
|
||||
buffer: *mut i8,
|
||||
lwe_dimension: u32,
|
||||
glwe_dimension: u32,
|
||||
polynomial_size: u32,
|
||||
grouping_factor: u32,
|
||||
base_log: u32,
|
||||
level_count: u32,
|
||||
num_samples: u32,
|
||||
num_many_lut: u32,
|
||||
lut_stride: u32,
|
||||
);
|
||||
}
|
||||
unsafe extern "C" {
|
||||
pub fn cleanup_cuda_multi_bit_programmable_bootstrap(
|
||||
stream: *mut ffi::c_void,
|
||||
|
||||
@@ -415,7 +415,7 @@ mod cuda {
|
||||
let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key();
|
||||
let ciphertext_modulus_out = CiphertextModulus::<KeyswitchScalar>::new_native();
|
||||
|
||||
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
|
||||
let cpu_keys: CpuKeys<Scalar, KeyswitchScalar> = CpuKeysBuilder::new()
|
||||
.keyswitch_key(ksk_big_to_small)
|
||||
.build();
|
||||
|
||||
@@ -667,7 +667,8 @@ mod cuda {
|
||||
&mut encryption_generator,
|
||||
);
|
||||
|
||||
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().packing_keyswitch_key(pksk).build();
|
||||
let cpu_keys: CpuKeys<Scalar, Scalar> =
|
||||
CpuKeysBuilder::new().packing_keyswitch_key(pksk).build();
|
||||
|
||||
let bench_id;
|
||||
match get_bench_type() {
|
||||
|
||||
@@ -511,12 +511,19 @@ mod cuda {
|
||||
};
|
||||
use tfhe::core_crypto::prelude::*;
|
||||
|
||||
fn cuda_ks_pbs<Scalar: UnsignedTorus + CastInto<usize> + CastFrom<u64> + Serialize>(
|
||||
fn cuda_ks_pbs<
|
||||
Scalar: UnsignedTorus + CastInto<usize> + CastFrom<u64> + Serialize,
|
||||
KeyswitchScalar: UnsignedTorus + CastFrom<Scalar> + CastInto<usize>,
|
||||
>(
|
||||
c: &mut Criterion,
|
||||
parameters: &[(String, CryptoParametersRecord<Scalar>)],
|
||||
) {
|
||||
let bench_name = "core_crypto::cuda::ks_pbs";
|
||||
let mut bench_group = c.benchmark_group(bench_name);
|
||||
let bench_name = format!(
|
||||
"core_crypto::cuda::ks_pbs::ks{}::pbs{}",
|
||||
KeyswitchScalar::BITS,
|
||||
Scalar::BITS
|
||||
);
|
||||
let mut bench_group = c.benchmark_group(bench_name.clone());
|
||||
bench_group
|
||||
.sample_size(10)
|
||||
.measurement_time(std::time::Duration::from_secs(30));
|
||||
@@ -535,29 +542,57 @@ mod cuda {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create the LweSecretKey
|
||||
let input_lwe_secret_key = allocate_and_generate_new_binary_lwe_secret_key(
|
||||
params.lwe_dimension.unwrap(),
|
||||
&mut secret_generator,
|
||||
);
|
||||
let output_glwe_secret_key: GlweSecretKeyOwned<Scalar> =
|
||||
let lwe_noise_distribution_ksk = match params.lwe_noise_distribution.unwrap() {
|
||||
DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => {
|
||||
DynamicDistribution::<KeyswitchScalar>::new_gaussian(
|
||||
gaussian_lwe_noise_distribution.standard_dev(),
|
||||
)
|
||||
}
|
||||
DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => {
|
||||
DynamicDistribution::<KeyswitchScalar>::new_t_uniform(
|
||||
match KeyswitchScalar::BITS {
|
||||
32 => uniform_lwe_noise_distribution.bound_log2() - 32,
|
||||
64 => uniform_lwe_noise_distribution.bound_log2(),
|
||||
_ => panic!("Unsupported Keyswitch scalar input dtype"),
|
||||
},
|
||||
)
|
||||
}
|
||||
};
|
||||
|
||||
let lwe_sk: LweSecretKeyOwned<KeyswitchScalar> =
|
||||
allocate_and_generate_new_binary_lwe_secret_key(
|
||||
params.lwe_dimension.unwrap(),
|
||||
&mut secret_generator,
|
||||
);
|
||||
|
||||
let glwe_sk: GlweSecretKeyOwned<KeyswitchScalar> =
|
||||
allocate_and_generate_new_binary_glwe_secret_key(
|
||||
params.glwe_dimension.unwrap(),
|
||||
params.polynomial_size.unwrap(),
|
||||
&mut secret_generator,
|
||||
);
|
||||
let output_lwe_secret_key = output_glwe_secret_key.into_lwe_secret_key();
|
||||
let big_lwe_sk = glwe_sk.into_lwe_secret_key();
|
||||
|
||||
let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
|
||||
&output_lwe_secret_key,
|
||||
&input_lwe_secret_key,
|
||||
&big_lwe_sk,
|
||||
&lwe_sk,
|
||||
params.ks_base_log.unwrap(),
|
||||
params.ks_level.unwrap(),
|
||||
params.lwe_noise_distribution.unwrap(),
|
||||
lwe_noise_distribution_ksk,
|
||||
CiphertextModulus::new_native(),
|
||||
&mut encryption_generator,
|
||||
);
|
||||
|
||||
let glwe_sk_64: GlweSecretKeyOwned<Scalar> =
|
||||
allocate_and_generate_new_binary_glwe_secret_key(
|
||||
params.glwe_dimension.unwrap(),
|
||||
params.polynomial_size.unwrap(),
|
||||
&mut secret_generator,
|
||||
);
|
||||
|
||||
let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key();
|
||||
let ciphertext_modulus_out = CiphertextModulus::<KeyswitchScalar>::new_native();
|
||||
|
||||
let bsk = LweBootstrapKey::new(
|
||||
Scalar::ZERO,
|
||||
params.glwe_dimension.unwrap().to_glwe_size(),
|
||||
@@ -568,7 +603,7 @@ mod cuda {
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
);
|
||||
|
||||
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
|
||||
let cpu_keys: CpuKeys<_, _> = CpuKeysBuilder::new()
|
||||
.keyswitch_key(ksk_big_to_small)
|
||||
.bootstrap_key(bsk)
|
||||
.build();
|
||||
@@ -582,7 +617,7 @@ mod cuda {
|
||||
|
||||
// Allocate a new LweCiphertext and encrypt our plaintext
|
||||
let input_ks_ct = allocate_and_encrypt_new_lwe_ciphertext(
|
||||
&output_lwe_secret_key,
|
||||
&big_lwe_sk_64,
|
||||
Plaintext(Scalar::ZERO),
|
||||
params.lwe_noise_distribution.unwrap(),
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
@@ -591,10 +626,10 @@ mod cuda {
|
||||
let input_ks_ct_gpu =
|
||||
CudaLweCiphertextList::from_lwe_ciphertext(&input_ks_ct, &streams);
|
||||
|
||||
let output_ks_ct: LweCiphertextOwned<Scalar> = LweCiphertext::new(
|
||||
Scalar::ZERO,
|
||||
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
let output_ks_ct: LweCiphertextOwned<KeyswitchScalar> = LweCiphertext::new(
|
||||
KeyswitchScalar::ZERO,
|
||||
lwe_sk.lwe_dimension().to_lwe_size(),
|
||||
ciphertext_modulus_out,
|
||||
);
|
||||
let mut output_ks_ct_gpu =
|
||||
CudaLweCiphertextList::from_lwe_ciphertext(&output_ks_ct, &streams);
|
||||
@@ -611,7 +646,7 @@ mod cuda {
|
||||
// Allocate the LweCiphertext to store the result of the PBS
|
||||
let output_pbs_ct = LweCiphertext::new(
|
||||
Scalar::ZERO,
|
||||
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
|
||||
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
);
|
||||
let mut output_pbs_ct_gpu =
|
||||
@@ -671,12 +706,12 @@ mod cuda {
|
||||
.map(|i| {
|
||||
let mut input_ks_list = LweCiphertextList::new(
|
||||
Scalar::ZERO,
|
||||
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
|
||||
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
|
||||
LweCiphertextCount(elements_per_stream),
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
);
|
||||
encrypt_lwe_ciphertext_list(
|
||||
&output_lwe_secret_key,
|
||||
&big_lwe_sk_64,
|
||||
&mut input_ks_list,
|
||||
&plaintext_list,
|
||||
params.lwe_noise_distribution.unwrap(),
|
||||
@@ -692,10 +727,10 @@ mod cuda {
|
||||
let output_ks_cts = (0..gpu_count)
|
||||
.map(|i| {
|
||||
let output_ks_list = LweCiphertextList::new(
|
||||
Scalar::ZERO,
|
||||
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
|
||||
KeyswitchScalar::ZERO,
|
||||
lwe_sk.lwe_dimension().to_lwe_size(),
|
||||
LweCiphertextCount(elements_per_stream),
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
ciphertext_modulus_out,
|
||||
);
|
||||
CudaLweCiphertextList::from_lwe_ciphertext_list(
|
||||
&output_ks_list,
|
||||
@@ -724,7 +759,7 @@ mod cuda {
|
||||
.map(|i| {
|
||||
let output_pbs_ct = LweCiphertextList::new(
|
||||
Scalar::ZERO,
|
||||
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
|
||||
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
|
||||
LweCiphertextCount(elements_per_stream),
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
);
|
||||
@@ -822,12 +857,18 @@ mod cuda {
|
||||
|
||||
fn cuda_multi_bit_ks_pbs<
|
||||
Scalar: UnsignedTorus + CastInto<usize> + CastFrom<u64> + Default + Serialize + Sync,
|
||||
KeyswitchScalar: UnsignedTorus + CastFrom<Scalar> + CastInto<usize>,
|
||||
>(
|
||||
c: &mut Criterion,
|
||||
parameters: &[(String, CryptoParametersRecord<Scalar>, LweBskGroupingFactor)],
|
||||
) {
|
||||
let bench_name = "core_crypto::cuda::multi_bit_ks_pbs";
|
||||
let mut bench_group = c.benchmark_group(bench_name);
|
||||
let bench_name = format!(
|
||||
"core_crypto::cuda::multi_bit_ks_pbs::ks_pbs::ks{}::pbs{}",
|
||||
KeyswitchScalar::BITS,
|
||||
Scalar::BITS
|
||||
);
|
||||
|
||||
let mut bench_group = c.benchmark_group(bench_name.clone());
|
||||
bench_group
|
||||
.sample_size(10)
|
||||
.measurement_time(std::time::Duration::from_secs(30));
|
||||
@@ -846,29 +887,57 @@ mod cuda {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create the LweSecretKey
|
||||
let input_lwe_secret_key = allocate_and_generate_new_binary_lwe_secret_key(
|
||||
params.lwe_dimension.unwrap(),
|
||||
&mut secret_generator,
|
||||
);
|
||||
let output_glwe_secret_key: GlweSecretKeyOwned<Scalar> =
|
||||
let lwe_noise_distribution_ksk = match params.lwe_noise_distribution.unwrap() {
|
||||
DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => {
|
||||
DynamicDistribution::<KeyswitchScalar>::new_gaussian(
|
||||
gaussian_lwe_noise_distribution.standard_dev(),
|
||||
)
|
||||
}
|
||||
DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => {
|
||||
DynamicDistribution::<KeyswitchScalar>::new_t_uniform(
|
||||
match KeyswitchScalar::BITS {
|
||||
32 => uniform_lwe_noise_distribution.bound_log2() - 32,
|
||||
64 => uniform_lwe_noise_distribution.bound_log2(),
|
||||
_ => panic!("Unsupported Keyswitch scalar input dtype"),
|
||||
},
|
||||
)
|
||||
}
|
||||
};
|
||||
|
||||
let lwe_sk: LweSecretKeyOwned<KeyswitchScalar> =
|
||||
allocate_and_generate_new_binary_lwe_secret_key(
|
||||
params.lwe_dimension.unwrap(),
|
||||
&mut secret_generator,
|
||||
);
|
||||
|
||||
let glwe_sk: GlweSecretKeyOwned<KeyswitchScalar> =
|
||||
allocate_and_generate_new_binary_glwe_secret_key(
|
||||
params.glwe_dimension.unwrap(),
|
||||
params.polynomial_size.unwrap(),
|
||||
&mut secret_generator,
|
||||
);
|
||||
let output_lwe_secret_key = output_glwe_secret_key.into_lwe_secret_key();
|
||||
let big_lwe_sk = glwe_sk.into_lwe_secret_key();
|
||||
|
||||
let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
|
||||
&output_lwe_secret_key,
|
||||
&input_lwe_secret_key,
|
||||
&big_lwe_sk,
|
||||
&lwe_sk,
|
||||
params.ks_base_log.unwrap(),
|
||||
params.ks_level.unwrap(),
|
||||
params.lwe_noise_distribution.unwrap(),
|
||||
lwe_noise_distribution_ksk,
|
||||
CiphertextModulus::new_native(),
|
||||
&mut encryption_generator,
|
||||
);
|
||||
|
||||
let glwe_sk_64: GlweSecretKeyOwned<Scalar> =
|
||||
allocate_and_generate_new_binary_glwe_secret_key(
|
||||
params.glwe_dimension.unwrap(),
|
||||
params.polynomial_size.unwrap(),
|
||||
&mut secret_generator,
|
||||
);
|
||||
|
||||
let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key();
|
||||
let ciphertext_modulus_out = CiphertextModulus::<KeyswitchScalar>::new_native();
|
||||
|
||||
let multi_bit_bsk = LweMultiBitBootstrapKey::new(
|
||||
Scalar::ZERO,
|
||||
params.glwe_dimension.unwrap().to_glwe_size(),
|
||||
@@ -880,7 +949,7 @@ mod cuda {
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
);
|
||||
|
||||
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
|
||||
let cpu_keys: CpuKeys<_, _> = CpuKeysBuilder::new()
|
||||
.keyswitch_key(ksk_big_to_small)
|
||||
.multi_bit_bootstrap_key(multi_bit_bsk)
|
||||
.build();
|
||||
@@ -894,7 +963,7 @@ mod cuda {
|
||||
|
||||
// Allocate a new LweCiphertext and encrypt our plaintext
|
||||
let input_ks_ct = allocate_and_encrypt_new_lwe_ciphertext(
|
||||
&output_lwe_secret_key,
|
||||
&big_lwe_sk_64,
|
||||
Plaintext(Scalar::ZERO),
|
||||
params.lwe_noise_distribution.unwrap(),
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
@@ -903,10 +972,10 @@ mod cuda {
|
||||
let input_ks_ct_gpu =
|
||||
CudaLweCiphertextList::from_lwe_ciphertext(&input_ks_ct, &streams);
|
||||
|
||||
let output_ks_ct: LweCiphertextOwned<Scalar> = LweCiphertext::new(
|
||||
Scalar::ZERO,
|
||||
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
let output_ks_ct: LweCiphertextOwned<KeyswitchScalar> = LweCiphertext::new(
|
||||
KeyswitchScalar::ZERO,
|
||||
lwe_sk.lwe_dimension().to_lwe_size(),
|
||||
ciphertext_modulus_out,
|
||||
);
|
||||
let mut output_ks_ct_gpu =
|
||||
CudaLweCiphertextList::from_lwe_ciphertext(&output_ks_ct, &streams);
|
||||
@@ -923,7 +992,7 @@ mod cuda {
|
||||
// Allocate the LweCiphertext to store the result of the PBS
|
||||
let output_pbs_ct = LweCiphertext::new(
|
||||
Scalar::ZERO,
|
||||
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
|
||||
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
);
|
||||
let mut output_pbs_ct_gpu =
|
||||
@@ -981,12 +1050,12 @@ mod cuda {
|
||||
.map(|i| {
|
||||
let mut input_ks_list = LweCiphertextList::new(
|
||||
Scalar::ZERO,
|
||||
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
|
||||
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
|
||||
LweCiphertextCount(elements_per_stream),
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
);
|
||||
encrypt_lwe_ciphertext_list(
|
||||
&output_lwe_secret_key,
|
||||
&big_lwe_sk_64,
|
||||
&mut input_ks_list,
|
||||
&plaintext_list,
|
||||
params.lwe_noise_distribution.unwrap(),
|
||||
@@ -1002,10 +1071,10 @@ mod cuda {
|
||||
let output_ks_cts = (0..gpu_count)
|
||||
.map(|i| {
|
||||
let output_ks_list = LweCiphertextList::new(
|
||||
Scalar::ZERO,
|
||||
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
|
||||
KeyswitchScalar::ZERO,
|
||||
lwe_sk.lwe_dimension().to_lwe_size(),
|
||||
LweCiphertextCount(elements_per_stream),
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
ciphertext_modulus_out,
|
||||
);
|
||||
CudaLweCiphertextList::from_lwe_ciphertext_list(
|
||||
&output_ks_list,
|
||||
@@ -1034,7 +1103,7 @@ mod cuda {
|
||||
.map(|i| {
|
||||
let output_pbs_ct = LweCiphertextList::new(
|
||||
Scalar::ZERO,
|
||||
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
|
||||
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
|
||||
LweCiphertextCount(elements_per_stream),
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
);
|
||||
@@ -1132,12 +1201,17 @@ mod cuda {
|
||||
|
||||
pub fn cuda_ks_pbs_group() {
|
||||
let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args();
|
||||
cuda_ks_pbs(&mut criterion, &benchmark_parameters());
|
||||
cuda_ks_pbs::<u64, u32>(&mut criterion, &benchmark_parameters());
|
||||
cuda_ks_pbs::<u64, u64>(&mut criterion, &benchmark_parameters());
|
||||
}
|
||||
|
||||
pub fn cuda_multi_bit_ks_pbs_group() {
|
||||
let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args();
|
||||
cuda_multi_bit_ks_pbs(
|
||||
cuda_multi_bit_ks_pbs::<u64, u32>(
|
||||
&mut criterion,
|
||||
&multi_bit_benchmark_parameters_with_grouping(),
|
||||
);
|
||||
cuda_multi_bit_ks_pbs::<u64, u64>(
|
||||
&mut criterion,
|
||||
&multi_bit_benchmark_parameters_with_grouping(),
|
||||
);
|
||||
|
||||
@@ -250,7 +250,7 @@ mod cuda {
|
||||
}
|
||||
};
|
||||
|
||||
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().bootstrap_key(bsk).build();
|
||||
let cpu_keys: CpuKeys<_, u64> = CpuKeysBuilder::new().bootstrap_key(bsk).build();
|
||||
|
||||
let message_modulus: u64 = 1 << 4;
|
||||
let input_message: u64 = 3;
|
||||
@@ -493,7 +493,7 @@ mod cuda {
|
||||
squash_params.ciphertext_modulus,
|
||||
);
|
||||
|
||||
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
|
||||
let cpu_keys: CpuKeys<_, u64> = CpuKeysBuilder::new()
|
||||
.multi_bit_bootstrap_key(multi_bit_bsk)
|
||||
.build();
|
||||
|
||||
|
||||
@@ -978,7 +978,8 @@ mod cuda {
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
);
|
||||
|
||||
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().bootstrap_key(bsk).build();
|
||||
let cpu_keys: CpuKeys<Scalar, Scalar> =
|
||||
CpuKeysBuilder::new().bootstrap_key(bsk).build();
|
||||
|
||||
let bench_id;
|
||||
|
||||
@@ -1236,7 +1237,7 @@ mod cuda {
|
||||
params.ciphertext_modulus.unwrap(),
|
||||
);
|
||||
|
||||
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
|
||||
let cpu_keys: CpuKeys<Scalar, Scalar> = CpuKeysBuilder::new()
|
||||
.multi_bit_bootstrap_key(multi_bit_bsk)
|
||||
.build();
|
||||
|
||||
|
||||
@@ -575,28 +575,28 @@ mod cuda_utils {
|
||||
}
|
||||
|
||||
/// Computing keys in their CPU flavor.
|
||||
pub struct CpuKeys<T: UnsignedInteger> {
|
||||
ksk: Option<LweKeyswitchKeyOwned<T>>,
|
||||
pub struct CpuKeys<T: UnsignedInteger, KST: UnsignedInteger> {
|
||||
ksk: Option<LweKeyswitchKeyOwned<KST>>,
|
||||
pksk: Option<LwePackingKeyswitchKeyOwned<T>>,
|
||||
bsk: Option<LweBootstrapKeyOwned<T>>,
|
||||
multi_bit_bsk: Option<LweMultiBitBootstrapKeyOwned<T>>,
|
||||
}
|
||||
|
||||
impl<T: UnsignedInteger> CpuKeys<T> {
|
||||
pub fn builder() -> CpuKeysBuilder<T> {
|
||||
impl<T: UnsignedInteger, KST: UnsignedInteger> CpuKeys<T, KST> {
|
||||
pub fn builder() -> CpuKeysBuilder<T, KST> {
|
||||
CpuKeysBuilder::new()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct CpuKeysBuilder<T: UnsignedInteger> {
|
||||
ksk: Option<LweKeyswitchKeyOwned<T>>,
|
||||
pub struct CpuKeysBuilder<T: UnsignedInteger, KST: UnsignedInteger> {
|
||||
ksk: Option<LweKeyswitchKeyOwned<KST>>,
|
||||
pksk: Option<LwePackingKeyswitchKeyOwned<T>>,
|
||||
bsk: Option<LweBootstrapKeyOwned<T>>,
|
||||
multi_bit_bsk: Option<LweMultiBitBootstrapKeyOwned<T>>,
|
||||
}
|
||||
|
||||
impl<T: UnsignedInteger> CpuKeysBuilder<T> {
|
||||
pub fn new() -> CpuKeysBuilder<T> {
|
||||
impl<T: UnsignedInteger, KST: UnsignedInteger> CpuKeysBuilder<T, KST> {
|
||||
pub fn new() -> CpuKeysBuilder<T, KST> {
|
||||
Self {
|
||||
ksk: None,
|
||||
pksk: None,
|
||||
@@ -605,7 +605,7 @@ mod cuda_utils {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn keyswitch_key(mut self, ksk: LweKeyswitchKeyOwned<T>) -> CpuKeysBuilder<T> {
|
||||
pub fn keyswitch_key(mut self, ksk: LweKeyswitchKeyOwned<KST>) -> CpuKeysBuilder<T, KST> {
|
||||
self.ksk = Some(ksk);
|
||||
self
|
||||
}
|
||||
@@ -613,12 +613,12 @@ mod cuda_utils {
|
||||
pub fn packing_keyswitch_key(
|
||||
mut self,
|
||||
pksk: LwePackingKeyswitchKeyOwned<T>,
|
||||
) -> CpuKeysBuilder<T> {
|
||||
) -> CpuKeysBuilder<T, KST> {
|
||||
self.pksk = Some(pksk);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn bootstrap_key(mut self, bsk: LweBootstrapKeyOwned<T>) -> CpuKeysBuilder<T> {
|
||||
pub fn bootstrap_key(mut self, bsk: LweBootstrapKeyOwned<T>) -> CpuKeysBuilder<T, KST> {
|
||||
self.bsk = Some(bsk);
|
||||
self
|
||||
}
|
||||
@@ -626,12 +626,12 @@ mod cuda_utils {
|
||||
pub fn multi_bit_bootstrap_key(
|
||||
mut self,
|
||||
mb_bsk: LweMultiBitBootstrapKeyOwned<T>,
|
||||
) -> CpuKeysBuilder<T> {
|
||||
) -> CpuKeysBuilder<T, KST> {
|
||||
self.multi_bit_bsk = Some(mb_bsk);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build(self) -> CpuKeys<T> {
|
||||
pub fn build(self) -> CpuKeys<T, KST> {
|
||||
CpuKeys {
|
||||
ksk: self.ksk,
|
||||
pksk: self.pksk,
|
||||
@@ -640,7 +640,7 @@ mod cuda_utils {
|
||||
}
|
||||
}
|
||||
}
|
||||
impl<T: UnsignedInteger> Default for CpuKeysBuilder<T> {
|
||||
impl<T: UnsignedInteger, KST: UnsignedInteger> Default for CpuKeysBuilder<T, KST> {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
@@ -648,17 +648,17 @@ mod cuda_utils {
|
||||
|
||||
/// Computing keys in their Cuda flavor.
|
||||
#[allow(dead_code)]
|
||||
pub struct CudaLocalKeys<T: UnsignedInteger> {
|
||||
pub ksk: Option<CudaLweKeyswitchKey<T>>,
|
||||
pub struct CudaLocalKeys<T: UnsignedInteger, KST: UnsignedInteger> {
|
||||
pub ksk: Option<CudaLweKeyswitchKey<KST>>,
|
||||
pub pksk: Option<CudaLwePackingKeyswitchKey<T>>,
|
||||
pub bsk: Option<CudaLweBootstrapKey>,
|
||||
pub multi_bit_bsk: Option<CudaLweMultiBitBootstrapKey<T>>,
|
||||
}
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl<T: UnsignedInteger> CudaLocalKeys<T> {
|
||||
impl<T: UnsignedInteger, KST: UnsignedInteger> CudaLocalKeys<T, KST> {
|
||||
pub fn from_cpu_keys(
|
||||
cpu_keys: &CpuKeys<T>,
|
||||
cpu_keys: &CpuKeys<T, KST>,
|
||||
ms_noise_reduction: Option<CudaModulusSwitchNoiseReductionConfiguration>,
|
||||
stream: &CudaStreams,
|
||||
) -> Self {
|
||||
@@ -681,10 +681,10 @@ mod cuda_utils {
|
||||
}
|
||||
|
||||
/// Instantiate Cuda computing keys to each available GPU.
|
||||
pub fn cuda_local_keys_core<T: UnsignedInteger>(
|
||||
cpu_keys: &CpuKeys<T>,
|
||||
pub fn cuda_local_keys_core<T: UnsignedInteger, KST: UnsignedInteger>(
|
||||
cpu_keys: &CpuKeys<T, KST>,
|
||||
ms_noise_reduction: Option<CudaModulusSwitchNoiseReductionConfiguration>,
|
||||
) -> Vec<CudaLocalKeys<T>> {
|
||||
) -> Vec<CudaLocalKeys<T, KST>> {
|
||||
let gpu_count = get_number_of_gpus() as usize;
|
||||
let mut gpu_keys_vec = Vec::with_capacity(gpu_count);
|
||||
for i in 0..gpu_count {
|
||||
|
||||
@@ -10,8 +10,8 @@ use crate::core_crypto::prelude::{CastInto, UnsignedTorus};
|
||||
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
|
||||
/// be dropped until streams is synchronised
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
|
||||
input: &CudaLweCiphertextList<Scalar>,
|
||||
pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<InputScalar, Scalar>(
|
||||
input: &CudaLweCiphertextList<InputScalar>,
|
||||
output: &mut CudaLweCiphertextList<Scalar>,
|
||||
accumulator: &CudaGlweCiphertextList<Scalar>,
|
||||
lut_indexes: &CudaVec<Scalar>,
|
||||
@@ -20,6 +20,7 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>
|
||||
multi_bit_bsk: &CudaLweMultiBitBootstrapKey<Scalar>,
|
||||
streams: &CudaStreams,
|
||||
) where
|
||||
InputScalar: UnsignedTorus + CastInto<usize>,
|
||||
// CastInto required for PBS modulus switch which returns a usize
|
||||
Scalar: UnsignedTorus + CastInto<usize>,
|
||||
{
|
||||
@@ -59,21 +60,22 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>
|
||||
multi_bit_bsk.polynomial_size(),
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
input.ciphertext_modulus(),
|
||||
output.ciphertext_modulus(),
|
||||
assert!(
|
||||
input.ciphertext_modulus().associated_scalar_bits()
|
||||
<= output.ciphertext_modulus().associated_scalar_bits(),
|
||||
"Mismatched CiphertextModulus between input ({:?}) and output ({:?})",
|
||||
input.ciphertext_modulus(),
|
||||
output.ciphertext_modulus(),
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
input.ciphertext_modulus(),
|
||||
output.ciphertext_modulus(),
|
||||
accumulator.ciphertext_modulus(),
|
||||
"Mismatched CiphertextModulus between input ({:?}) and accumulator ({:?})",
|
||||
"Mismatched CiphertextModulus between output ({:?}) and accumulator ({:?})",
|
||||
input.ciphertext_modulus(),
|
||||
accumulator.ciphertext_modulus(),
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
streams.gpu_indexes[0],
|
||||
multi_bit_bsk.d_vec.gpu_index(0),
|
||||
@@ -144,8 +146,8 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<Scalar>(
|
||||
input: &CudaLweCiphertextList<Scalar>,
|
||||
pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<InputScalar, Scalar>(
|
||||
input: &CudaLweCiphertextList<InputScalar>,
|
||||
output: &mut CudaLweCiphertextList<Scalar>,
|
||||
accumulator: &CudaGlweCiphertextList<Scalar>,
|
||||
lut_indexes: &CudaVec<Scalar>,
|
||||
@@ -154,6 +156,7 @@ pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<Scalar>(
|
||||
multi_bit_bsk: &CudaLweMultiBitBootstrapKey<Scalar>,
|
||||
streams: &CudaStreams,
|
||||
) where
|
||||
InputScalar: UnsignedTorus + CastInto<usize>,
|
||||
// CastInto required for PBS modulus switch which returns a usize
|
||||
Scalar: UnsignedTorus + CastInto<usize>,
|
||||
{
|
||||
|
||||
@@ -12,8 +12,8 @@ use crate::core_crypto::prelude::{CastInto, UnsignedTorus};
|
||||
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
|
||||
/// be dropped until streams is synchronised
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
|
||||
input: &CudaLweCiphertextList<Scalar>,
|
||||
pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<InputScalar, Scalar>(
|
||||
input: &CudaLweCiphertextList<InputScalar>,
|
||||
output: &mut CudaLweCiphertextList<Scalar>,
|
||||
accumulator: &CudaGlweCiphertextList<Scalar>,
|
||||
lut_indexes: &CudaVec<Scalar>,
|
||||
@@ -22,6 +22,7 @@ pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
|
||||
bsk: &CudaLweBootstrapKey,
|
||||
streams: &CudaStreams,
|
||||
) where
|
||||
InputScalar: UnsignedTorus + CastInto<usize>,
|
||||
// CastInto required for PBS modulus switch which returns a usize
|
||||
Scalar: UnsignedTorus + CastInto<usize>,
|
||||
{
|
||||
@@ -259,8 +260,8 @@ pub unsafe fn cuda_programmable_bootstrap_128_lwe_ciphertext_async<Scalar>(
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn cuda_programmable_bootstrap_lwe_ciphertext<Scalar>(
|
||||
input: &CudaLweCiphertextList<Scalar>,
|
||||
pub fn cuda_programmable_bootstrap_lwe_ciphertext<InputScalar, Scalar>(
|
||||
input: &CudaLweCiphertextList<InputScalar>,
|
||||
output: &mut CudaLweCiphertextList<Scalar>,
|
||||
accumulator: &CudaGlweCiphertextList<Scalar>,
|
||||
lut_indexes: &CudaVec<Scalar>,
|
||||
@@ -269,6 +270,7 @@ pub fn cuda_programmable_bootstrap_lwe_ciphertext<Scalar>(
|
||||
bsk: &CudaLweBootstrapKey,
|
||||
streams: &CudaStreams,
|
||||
) where
|
||||
InputScalar: UnsignedTorus + CastInto<usize>,
|
||||
Scalar: UnsignedTorus + CastInto<usize>,
|
||||
{
|
||||
unsafe {
|
||||
|
||||
@@ -4,8 +4,11 @@ use crate::core_crypto::gpu::lwe_bootstrap_key::{
|
||||
CudaLweBootstrapKey, CudaModulusSwitchNoiseReductionConfiguration,
|
||||
};
|
||||
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
|
||||
use crate::core_crypto::gpu::lwe_keyswitch_key::CudaLweKeyswitchKey;
|
||||
use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
|
||||
use crate::core_crypto::gpu::{cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams};
|
||||
use crate::core_crypto::gpu::{
|
||||
cuda_keyswitch_lwe_ciphertext, cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams,
|
||||
};
|
||||
use crate::core_crypto::prelude::misc::check_encrypted_content_respects_mod;
|
||||
use itertools::Itertools;
|
||||
|
||||
@@ -174,6 +177,218 @@ fn lwe_encrypt_pbs_decrypt<
|
||||
}
|
||||
}
|
||||
|
||||
fn lwe_ks_encrypt_pbs_decrypt_compare_32_64<
|
||||
Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize> + CastFrom<u32> + CastInto<u32>,
|
||||
>(
|
||||
params: ClassicTestParams<Scalar>,
|
||||
) {
|
||||
assert!(Scalar::BITS <= 64);
|
||||
|
||||
let input_lwe_dimension = params.lwe_dimension;
|
||||
let lwe_noise_distribution = params.lwe_noise_distribution;
|
||||
let glwe_noise_distribution = params.glwe_noise_distribution;
|
||||
let ciphertext_modulus = params.ciphertext_modulus;
|
||||
let message_modulus_log = params.message_modulus_log;
|
||||
let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
|
||||
let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
|
||||
let glwe_dimension = params.glwe_dimension;
|
||||
let polynomial_size = params.polynomial_size;
|
||||
let decomp_base_log = params.pbs_base_log;
|
||||
let decomp_level_count = params.pbs_level;
|
||||
|
||||
let gpu_index = 0;
|
||||
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
|
||||
|
||||
let mut rsc = TestResources::new();
|
||||
|
||||
let f = |x: Scalar| {
|
||||
x.wrapping_mul(Scalar::TWO)
|
||||
.wrapping_sub(Scalar::ONE)
|
||||
.wrapping_rem(msg_modulus)
|
||||
};
|
||||
|
||||
let delta: Scalar = encoding_with_padding / msg_modulus;
|
||||
let mut msg = msg_modulus;
|
||||
const NB_TESTS: usize = 10;
|
||||
let number_of_messages = 1;
|
||||
|
||||
let accumulator = generate_programmable_bootstrap_glwe_lut(
|
||||
polynomial_size,
|
||||
glwe_dimension.to_glwe_size(),
|
||||
msg_modulus.cast_into(),
|
||||
ciphertext_modulus,
|
||||
delta,
|
||||
f,
|
||||
);
|
||||
|
||||
assert!(check_encrypted_content_respects_mod(
|
||||
&accumulator,
|
||||
ciphertext_modulus
|
||||
));
|
||||
|
||||
let lwe_noise_distribution_ksk_32 = match params.lwe_noise_distribution {
|
||||
DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => {
|
||||
DynamicDistribution::<u32>::new_gaussian(gaussian_lwe_noise_distribution.standard_dev())
|
||||
}
|
||||
DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => {
|
||||
DynamicDistribution::<u32>::new_t_uniform(
|
||||
uniform_lwe_noise_distribution.bound_log2() - 32,
|
||||
)
|
||||
}
|
||||
};
|
||||
|
||||
// Create the LweSecretKey
|
||||
let intermediate_lwe_secret_key_32: LweSecretKeyOwned<u32> =
|
||||
allocate_and_generate_new_binary_lwe_secret_key(
|
||||
input_lwe_dimension,
|
||||
&mut rsc.secret_random_generator,
|
||||
); // lwe_sk
|
||||
|
||||
let glwe_sk_64: GlweSecretKeyOwned<Scalar> = allocate_and_generate_new_binary_glwe_secret_key(
|
||||
params.glwe_dimension,
|
||||
params.polynomial_size,
|
||||
&mut rsc.secret_random_generator,
|
||||
);
|
||||
|
||||
let big_lwe_sk_64 = glwe_sk_64.clone().into_lwe_secret_key();
|
||||
|
||||
let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
|
||||
&big_lwe_sk_64,
|
||||
&intermediate_lwe_secret_key_32,
|
||||
params.ks_base_log,
|
||||
params.ks_level,
|
||||
lwe_noise_distribution_ksk_32,
|
||||
CiphertextModulus::<u32>::new_native(),
|
||||
&mut rsc.encryption_random_generator,
|
||||
);
|
||||
let d_ksk_big_to_small =
|
||||
CudaLweKeyswitchKey::from_lwe_keyswitch_key(&ksk_big_to_small, &stream);
|
||||
|
||||
let ciphertext_modulus_ks32 = CiphertextModulus::<u32>::new_native();
|
||||
|
||||
let output_lwe_dimension = big_lwe_sk_64.lwe_dimension();
|
||||
|
||||
let mut bsk = LweBootstrapKey::new(
|
||||
Scalar::ZERO,
|
||||
glwe_dimension.to_glwe_size(),
|
||||
polynomial_size,
|
||||
decomp_base_log,
|
||||
decomp_level_count,
|
||||
input_lwe_dimension,
|
||||
ciphertext_modulus,
|
||||
);
|
||||
|
||||
par_generate_lwe_bootstrap_key(
|
||||
&intermediate_lwe_secret_key_32,
|
||||
&glwe_sk_64,
|
||||
&mut bsk,
|
||||
glwe_noise_distribution,
|
||||
&mut rsc.encryption_random_generator,
|
||||
);
|
||||
|
||||
assert!(check_encrypted_content_respects_mod(
|
||||
&*bsk,
|
||||
ciphertext_modulus
|
||||
));
|
||||
|
||||
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, None, &stream);
|
||||
|
||||
while msg != Scalar::ZERO {
|
||||
msg = msg.wrapping_sub(Scalar::ONE);
|
||||
for _ in 0..NB_TESTS {
|
||||
let plaintext = Plaintext(msg * delta);
|
||||
|
||||
let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
|
||||
&big_lwe_sk_64,
|
||||
plaintext,
|
||||
lwe_noise_distribution,
|
||||
ciphertext_modulus,
|
||||
&mut rsc.encryption_random_generator,
|
||||
);
|
||||
|
||||
assert!(check_encrypted_content_respects_mod(
|
||||
&lwe_ciphertext_in,
|
||||
ciphertext_modulus
|
||||
));
|
||||
|
||||
let d_lwe_ciphertext_in =
|
||||
CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream);
|
||||
|
||||
let output_ks_ct: LweCiphertextOwned<u32> = LweCiphertext::new(
|
||||
0u32,
|
||||
intermediate_lwe_secret_key_32.lwe_dimension().to_lwe_size(),
|
||||
ciphertext_modulus_ks32,
|
||||
);
|
||||
let mut output_ks_ct_gpu =
|
||||
CudaLweCiphertextList::from_lwe_ciphertext(&output_ks_ct, &stream);
|
||||
|
||||
let mut d_out_pbs_ct = CudaLweCiphertextList::new(
|
||||
output_lwe_dimension,
|
||||
LweCiphertextCount(1),
|
||||
ciphertext_modulus,
|
||||
&stream,
|
||||
);
|
||||
|
||||
let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
|
||||
|
||||
let test_vector_indexes: Vec<Scalar> = vec![Scalar::ZERO; number_of_messages];
|
||||
|
||||
let mut d_test_vector_indexes =
|
||||
unsafe { CudaVec::<Scalar>::new_async(number_of_messages, &stream, 0) };
|
||||
unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) };
|
||||
|
||||
let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0;
|
||||
let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();
|
||||
let lwe_indexes = lwe_indexes_usize
|
||||
.iter()
|
||||
.map(|&x| <usize as CastInto<Scalar>>::cast_into(x))
|
||||
.collect_vec();
|
||||
let mut d_output_indexes =
|
||||
unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
|
||||
let mut d_input_indexes =
|
||||
unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
|
||||
unsafe {
|
||||
d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
|
||||
d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
|
||||
}
|
||||
|
||||
cuda_keyswitch_lwe_ciphertext(
|
||||
&d_ksk_big_to_small,
|
||||
&d_lwe_ciphertext_in,
|
||||
&mut output_ks_ct_gpu,
|
||||
&d_input_indexes,
|
||||
&d_output_indexes,
|
||||
true,
|
||||
&stream,
|
||||
false,
|
||||
);
|
||||
|
||||
cuda_programmable_bootstrap_lwe_ciphertext(
|
||||
&output_ks_ct_gpu,
|
||||
&mut d_out_pbs_ct,
|
||||
&d_accumulator,
|
||||
&d_test_vector_indexes,
|
||||
&d_output_indexes,
|
||||
&d_input_indexes,
|
||||
&d_bsk,
|
||||
&stream,
|
||||
);
|
||||
|
||||
let out_pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream);
|
||||
assert!(check_encrypted_content_respects_mod(
|
||||
&out_pbs_ct,
|
||||
ciphertext_modulus
|
||||
));
|
||||
|
||||
let decrypted = decrypt_lwe_ciphertext(&big_lwe_sk_64, &out_pbs_ct);
|
||||
|
||||
let decoded = round_decode(decrypted.0, delta) % msg_modulus;
|
||||
|
||||
assert_eq!(decoded, f(msg));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn lwe_encrypt_centered_ms_pbs_decrypt<
|
||||
Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize>,
|
||||
>(
|
||||
@@ -327,6 +542,6 @@ fn lwe_encrypt_centered_ms_pbs_decrypt<
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
create_gpu_parameterized_test!(lwe_ks_encrypt_pbs_decrypt_compare_32_64);
|
||||
create_gpu_parameterized_test!(lwe_encrypt_pbs_decrypt);
|
||||
create_gpu_parameterized_test!(lwe_encrypt_centered_ms_pbs_decrypt);
|
||||
|
||||
@@ -166,13 +166,13 @@ impl Drop for CudaStreams {
|
||||
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
|
||||
/// required
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
|
||||
pub unsafe fn programmable_bootstrap_async<KST: UnsignedInteger, T: UnsignedInteger>(
|
||||
streams: &CudaStreams,
|
||||
lwe_array_out: &mut CudaVec<T>,
|
||||
lwe_out_indexes: &CudaVec<T>,
|
||||
test_vector: &CudaVec<T>,
|
||||
test_vector_indexes: &CudaVec<T>,
|
||||
lwe_array_in: &CudaVec<T>,
|
||||
lwe_array_in: &CudaVec<KST>,
|
||||
lwe_in_indexes: &CudaVec<T>,
|
||||
bootstrapping_key: &CudaVec<f64>,
|
||||
lwe_dimension: LweDimension,
|
||||
@@ -193,45 +193,89 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
|
||||
PBSMSNoiseReductionType::Centered
|
||||
});
|
||||
|
||||
scratch_cuda_programmable_bootstrap_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
std::ptr::addr_of_mut!(pbs_buffer),
|
||||
lwe_dimension.0 as u32,
|
||||
glwe_dimension.0 as u32,
|
||||
polynomial_size.0 as u32,
|
||||
level.0 as u32,
|
||||
num_samples,
|
||||
true,
|
||||
noise_reduction_type as u32,
|
||||
);
|
||||
if KST::BITS == 32 {
|
||||
scratch_cuda_programmable_bootstrap_32_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
std::ptr::addr_of_mut!(pbs_buffer),
|
||||
lwe_dimension.0 as u32,
|
||||
glwe_dimension.0 as u32,
|
||||
polynomial_size.0 as u32,
|
||||
level.0 as u32,
|
||||
num_samples,
|
||||
true,
|
||||
noise_reduction_type as u32,
|
||||
);
|
||||
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
lwe_array_out.as_mut_c_ptr(0),
|
||||
lwe_out_indexes.as_c_ptr(0),
|
||||
test_vector.as_c_ptr(0),
|
||||
test_vector_indexes.as_c_ptr(0),
|
||||
lwe_array_in.as_c_ptr(0),
|
||||
lwe_in_indexes.as_c_ptr(0),
|
||||
bootstrapping_key.as_c_ptr(0),
|
||||
pbs_buffer,
|
||||
lwe_dimension.0 as u32,
|
||||
glwe_dimension.0 as u32,
|
||||
polynomial_size.0 as u32,
|
||||
base_log.0 as u32,
|
||||
level.0 as u32,
|
||||
num_samples,
|
||||
num_many_lut,
|
||||
lut_stride,
|
||||
);
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
lwe_array_out.as_mut_c_ptr(0),
|
||||
lwe_out_indexes.as_c_ptr(0),
|
||||
test_vector.as_c_ptr(0),
|
||||
test_vector_indexes.as_c_ptr(0),
|
||||
lwe_array_in.as_c_ptr(0),
|
||||
lwe_in_indexes.as_c_ptr(0),
|
||||
bootstrapping_key.as_c_ptr(0),
|
||||
pbs_buffer,
|
||||
lwe_dimension.0 as u32,
|
||||
glwe_dimension.0 as u32,
|
||||
polynomial_size.0 as u32,
|
||||
base_log.0 as u32,
|
||||
level.0 as u32,
|
||||
num_samples,
|
||||
num_many_lut,
|
||||
lut_stride,
|
||||
);
|
||||
|
||||
cleanup_cuda_programmable_bootstrap(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
std::ptr::addr_of_mut!(pbs_buffer),
|
||||
);
|
||||
cleanup_cuda_programmable_bootstrap(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
std::ptr::addr_of_mut!(pbs_buffer),
|
||||
);
|
||||
} else if KST::BITS == 64 {
|
||||
scratch_cuda_programmable_bootstrap_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
std::ptr::addr_of_mut!(pbs_buffer),
|
||||
lwe_dimension.0 as u32,
|
||||
glwe_dimension.0 as u32,
|
||||
polynomial_size.0 as u32,
|
||||
level.0 as u32,
|
||||
num_samples,
|
||||
true,
|
||||
noise_reduction_type as u32,
|
||||
);
|
||||
|
||||
cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
lwe_array_out.as_mut_c_ptr(0),
|
||||
lwe_out_indexes.as_c_ptr(0),
|
||||
test_vector.as_c_ptr(0),
|
||||
test_vector_indexes.as_c_ptr(0),
|
||||
lwe_array_in.as_c_ptr(0),
|
||||
lwe_in_indexes.as_c_ptr(0),
|
||||
bootstrapping_key.as_c_ptr(0),
|
||||
pbs_buffer,
|
||||
lwe_dimension.0 as u32,
|
||||
glwe_dimension.0 as u32,
|
||||
polynomial_size.0 as u32,
|
||||
base_log.0 as u32,
|
||||
level.0 as u32,
|
||||
num_samples,
|
||||
num_many_lut,
|
||||
lut_stride,
|
||||
);
|
||||
|
||||
cleanup_cuda_programmable_bootstrap(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
std::ptr::addr_of_mut!(pbs_buffer),
|
||||
);
|
||||
} else {
|
||||
panic!("Unsupported PBS input bitwidth {}", KST::BITS);
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
@@ -348,6 +392,7 @@ pub unsafe fn programmable_bootstrap_128_async<T: UnsignedInteger>(
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub unsafe fn programmable_bootstrap_multi_bit_async<
|
||||
T: UnsignedInteger,
|
||||
KST: UnsignedInteger,
|
||||
B: Any + UnsignedInteger,
|
||||
>(
|
||||
streams: &CudaStreams,
|
||||
@@ -355,7 +400,7 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<
|
||||
output_indexes: &CudaVec<T>,
|
||||
test_vector: &CudaVec<B>,
|
||||
test_vector_indexes: &CudaVec<T>,
|
||||
lwe_array_in: &CudaVec<T>,
|
||||
lwe_array_in: &CudaVec<KST>,
|
||||
input_indexes: &CudaVec<T>,
|
||||
bootstrapping_key: &CudaVec<B>,
|
||||
lwe_dimension: LweDimension,
|
||||
@@ -406,42 +451,83 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<
|
||||
std::ptr::addr_of_mut!(pbs_buffer),
|
||||
);
|
||||
} else if TypeId::of::<B>() == TypeId::of::<u64>() {
|
||||
scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
std::ptr::addr_of_mut!(pbs_buffer),
|
||||
glwe_dimension.0 as u32,
|
||||
polynomial_size.0 as u32,
|
||||
level.0 as u32,
|
||||
num_samples,
|
||||
true,
|
||||
);
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
lwe_array_out.as_mut_c_ptr(0),
|
||||
output_indexes.as_c_ptr(0),
|
||||
test_vector.as_c_ptr(0),
|
||||
test_vector_indexes.as_c_ptr(0),
|
||||
lwe_array_in.as_c_ptr(0),
|
||||
input_indexes.as_c_ptr(0),
|
||||
bootstrapping_key.as_c_ptr(0),
|
||||
pbs_buffer,
|
||||
lwe_dimension.0 as u32,
|
||||
glwe_dimension.0 as u32,
|
||||
polynomial_size.0 as u32,
|
||||
grouping_factor.0 as u32,
|
||||
base_log.0 as u32,
|
||||
level.0 as u32,
|
||||
num_samples,
|
||||
num_many_lut,
|
||||
lut_stride,
|
||||
);
|
||||
cleanup_cuda_multi_bit_programmable_bootstrap(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
std::ptr::addr_of_mut!(pbs_buffer),
|
||||
);
|
||||
if TypeId::of::<KST>() == TypeId::of::<u32>() {
|
||||
scratch_cuda_multi_bit_programmable_bootstrap_32_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
std::ptr::addr_of_mut!(pbs_buffer),
|
||||
glwe_dimension.0 as u32,
|
||||
polynomial_size.0 as u32,
|
||||
level.0 as u32,
|
||||
num_samples,
|
||||
true,
|
||||
);
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
lwe_array_out.as_mut_c_ptr(0),
|
||||
output_indexes.as_c_ptr(0),
|
||||
test_vector.as_c_ptr(0),
|
||||
test_vector_indexes.as_c_ptr(0),
|
||||
lwe_array_in.as_c_ptr(0),
|
||||
input_indexes.as_c_ptr(0),
|
||||
bootstrapping_key.as_c_ptr(0),
|
||||
pbs_buffer,
|
||||
lwe_dimension.0 as u32,
|
||||
glwe_dimension.0 as u32,
|
||||
polynomial_size.0 as u32,
|
||||
grouping_factor.0 as u32,
|
||||
base_log.0 as u32,
|
||||
level.0 as u32,
|
||||
num_samples,
|
||||
num_many_lut,
|
||||
lut_stride,
|
||||
);
|
||||
cleanup_cuda_multi_bit_programmable_bootstrap(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
std::ptr::addr_of_mut!(pbs_buffer),
|
||||
);
|
||||
} else if TypeId::of::<KST>() == TypeId::of::<u64>() {
|
||||
scratch_cuda_multi_bit_programmable_bootstrap_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
std::ptr::addr_of_mut!(pbs_buffer),
|
||||
glwe_dimension.0 as u32,
|
||||
polynomial_size.0 as u32,
|
||||
level.0 as u32,
|
||||
num_samples,
|
||||
true,
|
||||
);
|
||||
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
lwe_array_out.as_mut_c_ptr(0),
|
||||
output_indexes.as_c_ptr(0),
|
||||
test_vector.as_c_ptr(0),
|
||||
test_vector_indexes.as_c_ptr(0),
|
||||
lwe_array_in.as_c_ptr(0),
|
||||
input_indexes.as_c_ptr(0),
|
||||
bootstrapping_key.as_c_ptr(0),
|
||||
pbs_buffer,
|
||||
lwe_dimension.0 as u32,
|
||||
glwe_dimension.0 as u32,
|
||||
polynomial_size.0 as u32,
|
||||
grouping_factor.0 as u32,
|
||||
base_log.0 as u32,
|
||||
level.0 as u32,
|
||||
num_samples,
|
||||
num_many_lut,
|
||||
lut_stride,
|
||||
);
|
||||
cleanup_cuda_multi_bit_programmable_bootstrap(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0].get(),
|
||||
std::ptr::addr_of_mut!(pbs_buffer),
|
||||
);
|
||||
} else {
|
||||
panic!("Unsupported MB PBS input torus size");
|
||||
}
|
||||
} else {
|
||||
panic!("Unsupported torus size")
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user