feat(gpu): support keyswitch 64/32 in PBS

Andrei Stoian
2025-10-10 17:25:03 +02:00
parent 78d1ce18c1
commit e43528db71
27 changed files with 1446 additions and 776 deletions
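Context for the per-file diffs below: the change threads a second template parameter, InputTorus, through the classical and multi-bit PBS so that a 64-to-32-bit keyswitch can feed 32-bit LWE ciphertexts into a bootstrap that still accumulates and outputs on 64 bits; the C entry points encode the pair as a _<input>_<output> suffix. A minimal type-level sketch of that convention (the function name here is hypothetical; the real entry points appear in the diffs below):

#include <cstdint>

// Sketch only: the PBS is now parameterized on two torus types.
//   InputTorus - width of the incoming LWE ciphertexts (possibly narrowed
//                by a 64->32 keyswitch)
//   Torus      - width used for the accumulator and the output LWE
template <typename InputTorus, typename Torus>
void pbs_sketch(Torus * /*lwe_out*/, const InputTorus * /*lwe_in*/) {}

// The two classical instantiations this commit exposes:
template void pbs_sketch<uint64_t, uint64_t>(uint64_t *, const uint64_t *); // *_64_64
template void pbs_sketch<uint32_t, uint64_t>(uint64_t *, const uint32_t *); // *_32_64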

View File

@@ -7,23 +7,23 @@ template <typename Torus>
bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
uint32_t polynomial_size, uint32_t max_shared_memory);
template <typename Torus>
template <typename InputTorus, typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if CUDA_ARCH >= 900
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -48,7 +48,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride);
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -96,7 +96,7 @@ template <typename Torus>
uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
uint32_t polynomial_size);
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t level_count, uint64_t full_sm_keybundle);
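Throughout this header the rule is the same: InputTorus appears only where the input LWE ciphertexts are read; LUTs, indexes, bootstrapping keys and outputs stay on the computation torus. A compile-checkable sketch of that convention, with a hypothetical name and body:

#include <cstdint>

template <typename InputTorus, typename Torus>
void pbs_signature_sketch(Torus *lwe_array_out,           // output:  Torus
                          const Torus *lut_vector,        // LUT:     Torus
                          const InputTorus *lwe_array_in, // input:   InputTorus
                          const Torus *lwe_input_indexes, // indexes: Torus
                          uint32_t lwe_dimension) {
  // A real implementation modulus-switches each InputTorus coefficient and
  // accumulates on Torus; elided here.
  (void)lwe_array_out; (void)lut_vector; (void)lwe_array_in;
  (void)lwe_input_indexes; (void)lwe_dimension;
}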

View File

@@ -428,29 +428,29 @@ uint64_t get_buffer_size_programmable_bootstrap_cg(
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus>
template <typename InputTorus, typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory);
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -458,11 +458,11 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
uint32_t lut_stride);
#if (CUDA_ARCH >= 900)
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -477,14 +477,14 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc(
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
#endif
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_programmable_bootstrap_cg(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,

View File

@@ -69,13 +69,19 @@ uint64_t scratch_cuda_programmable_bootstrap_64(
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
uint64_t scratch_cuda_programmable_bootstrap_32_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
uint64_t scratch_cuda_programmable_bootstrap_128(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
@@ -84,7 +90,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride);
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,

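The C ABI now encodes the input/output widths in the suffix: _64_64 for 64-bit inputs and outputs, _32_64 for 32-bit inputs (keyswitched down from 64 bits, per the commit title) with 64-bit outputs. A hedged selection sketch over the symbol names declared above; argument lists are elided because only the suffix choice is being illustrated:

#include <cstdint>

enum class InputWidth { U32, U64 };

inline const char *classical_pbs_symbol(InputWidth w) {
  switch (w) {
  case InputWidth::U32:
    return "cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64";
  case InputWidth::U64:
    return "cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64";
  }
  return nullptr;
}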
View File

@@ -4,12 +4,12 @@
#include "pbs_enums.h"
#include "stdint.h"
extern "C" {
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory);
extern "C" {
void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
void *stream, uint32_t gpu_index, void *dest, void const *src,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
@@ -20,6 +20,11 @@ void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_128(
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_32_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -35,6 +40,16 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
uint32_t gpu_index,
int8_t **pbs_buffer);
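A hedged call sketch for the new multi-bit 32/64 entry point declared above; every numeric parameter below is an illustrative placeholder, not a validated parameter set:

#include <cstdint>

extern "C" void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
    void *stream, uint32_t gpu_index, void *lwe_array_out,
    void const *lwe_output_indexes, void const *lut_vector,
    void const *lut_vector_indexes, void const *lwe_array_in,
    void const *lwe_input_indexes, void const *bootstrapping_key,
    int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
    uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
    uint32_t lut_stride);

// Placeholder call: lwe_array_in points at uint32_t ciphertexts; the LUT,
// key, indexes and output are on 64 bits.
void call_sketch(void *stream, void *out, const void *in, const void *lut,
                 const void *lut_idx, const void *out_idx, const void *in_idx,
                 const void *bsk, int8_t *scratch) {
  cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
      stream, /*gpu_index=*/0, out, out_idx, lut, lut_idx, in, in_idx, bsk,
      scratch, /*lwe_dimension=*/742, /*glwe_dimension=*/1,
      /*polynomial_size=*/2048, /*grouping_factor=*/2, /*base_log=*/23,
      /*level_count=*/1, /*num_samples=*/1, /*num_many_lut=*/1,
      /*lut_stride=*/0);
}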

View File

@@ -535,7 +535,6 @@ __host__ void host_gemm_keyswitch_lwe_ciphertext_vector(
dim3 grid_negate(CEIL_DIV(lwe_dimension_out + 1, BLOCK_SIZE_DECOMP),
CEIL_DIV(num_samples, BLOCK_SIZE_DECOMP));
dim3 threads_negate(BLOCK_SIZE_DECOMP, BLOCK_SIZE_DECOMP);
// Negate all outputs in the output LWEs. This is the final step in the GEMM
// keyswitch computed as: -(-b + sum(a_i A_KSK))
keyswitch_negate_with_output_indices<Torus, KSTorus>

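The comment above compresses the sign bookkeeping into one formula: the GEMM accumulates -b + sum(a_i A_KSK), and the final negate kernel turns that into b - sum(a_i A_KSK). A scalar sketch of that identity (decomposition levels omitted; uint64_t arithmetic wraps mod 2^64, so unary negation is well-defined torus negation):

#include <cstddef>
#include <cstdint>

inline uint64_t gemm_keyswitch_output_sketch(uint64_t b, const uint64_t *a,
                                             const uint64_t *ksk_column,
                                             size_t n) {
  uint64_t acc = 0 - b; // the GEMM starts from -b ...
  for (size_t i = 0; i < n; ++i)
    acc += a[i] * ksk_column[i]; // ... and adds sum(a_i * A_KSK)
  return 0 - acc; // final negate: -(-b + sum) == b - sum
}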
View File

@@ -263,42 +263,7 @@ void execute_pbs_async(CudaStreams streams,
uint32_t num_many_lut, uint32_t lut_stride) {
if constexpr (std::is_same_v<OutputTorus, uint32_t>) {
// 32 bits
switch (pbs_type) {
case MULTI_BIT:
PANIC("Error: 32-bit multibit PBS is not supported.\n")
case CLASSICAL:
for (uint i = 0; i < streams.count(); i++) {
int num_inputs_on_gpu = get_num_inputs_on_gpu(
input_lwe_ciphertext_count, i, streams.count());
int gpu_offset =
get_gpu_offset(input_lwe_ciphertext_count, i, streams.count());
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
// Use the macro to get the correct elements for the current iteration
// Handles the case when the input/output are scattered through
// different gpus and when it is not
auto current_lwe_array_out = get_variant_element(lwe_array_out, i);
auto current_lwe_output_indexes =
get_variant_element(lwe_output_indexes, i);
auto current_lwe_array_in = get_variant_element(lwe_array_in, i);
auto current_lwe_input_indexes =
get_variant_element(lwe_input_indexes, i);
cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,
bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, num_inputs_on_gpu,
num_many_lut, lut_stride);
}
break;
default:
PANIC("Error: unsupported cuda PBS type.")
}
PANIC("Error: unsupported 32b CUDA PBS type.")
} else if constexpr (std::is_same_v<OutputTorus, uint64_t>) {
// 64 bits
switch (pbs_type) {
@@ -353,7 +318,7 @@ void execute_pbs_async(CudaStreams streams,
auto d_lut_vector_indexes =
lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
streams.stream(i), streams.gpu_index(i), current_lwe_array_out,
current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
current_lwe_array_in, current_lwe_input_indexes,

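After this change the 32-bit output branch of execute_pbs_async panics unconditionally; only 64-bit outputs reach a kernel launch. A reduced sketch of the if constexpr dispatch, with the launch stubbed out:

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <type_traits>

template <typename OutputTorus> void execute_pbs_sketch() {
  if constexpr (std::is_same_v<OutputTorus, uint32_t>) {
    // 32-bit outputs are no longer supported.
    std::fprintf(stderr, "Error: unsupported 32b CUDA PBS type.\n");
    std::abort();
  } else if constexpr (std::is_same_v<OutputTorus, uint64_t>) {
    // cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(...)
  }
}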
View File

@@ -35,12 +35,13 @@ namespace cg = cooperative_groups;
*
* Each y-block computes one element of the lwe_array_out.
*/
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void device_programmable_bootstrap_cg(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -79,7 +80,7 @@ __global__ void device_programmable_bootstrap_cg(
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -95,8 +96,8 @@ __global__ void device_programmable_bootstrap_cg(
// Put "b" in [0, 2N[
constexpr auto log_modulus = params::log2_degree + 1;
Torus b_hat = 0;
Torus correction = 0;
InputTorus b_hat = 0;
InputTorus correction = 0;
if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) {
correction = centered_binary_modulus_switch_body_correction_to_add(
block_lwe_array_in, lwe_dimension, log_modulus);
@@ -113,7 +114,7 @@ __global__ void device_programmable_bootstrap_cg(
__syncthreads();
// Put "a" in [0, 2N[
Torus a_hat = 0;
InputTorus a_hat = 0;
modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1);
// Perform ACC * (X^â - 1)
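The kernel above modulus-switches each InputTorus coefficient into [0, 2N[ before rotating the accumulator. A minimal sketch of what modulus_switch is expected to compute for a power-of-two target modulus 2N with log_modulus = log2(2N), assuming log_modulus is smaller than the torus width and round-to-nearest:

#include <cstdint>

// Keep the top log_modulus bits of the torus value, rounding to nearest.
// Wraparound on the addition is the correct mod-2N behavior. Works for
// InputTorus = uint32_t or uint64_t.
template <typename InputTorus>
inline void modulus_switch_sketch(InputTorus input, InputTorus &out,
                                  uint32_t log_modulus) {
  constexpr uint32_t width = sizeof(InputTorus) * 8;
  out = (input + (static_cast<InputTorus>(1) << (width - log_modulus - 1))) >>
        (width - log_modulus);
}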
@@ -193,7 +194,7 @@ __global__ void device_programmable_bootstrap_cg(
}
}
template <typename Torus, typename params>
template <typename InputTorus, typename Torus, typename params>
__host__ uint64_t scratch_programmable_bootstrap_cg(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t lwe_dimension,
@@ -210,18 +211,18 @@ __host__ uint64_t scratch_programmable_bootstrap_cg(
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
@@ -237,11 +238,11 @@ __host__ uint64_t scratch_programmable_bootstrap_cg(
/*
* Host wrapper
*/
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void host_programmable_bootstrap_cg(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -293,26 +294,29 @@ __host__ void host_programmable_bootstrap_cg(
if (max_shared_memory < partial_sm) {
kernel_args[13] = &full_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_programmable_bootstrap_cg<Torus, params, NOSM>, grid,
thds, (void **)kernel_args, 0, stream));
(void *)
device_programmable_bootstrap_cg<InputTorus, Torus, params, NOSM>,
grid, thds, (void **)kernel_args, 0, stream));
} else if (max_shared_memory < full_sm) {
kernel_args[13] = &partial_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
(void *)device_programmable_bootstrap_cg<InputTorus, Torus, params,
PARTIALSM>,
grid, thds, (void **)kernel_args, partial_sm, stream));
} else {
int no_dm = 0;
kernel_args[13] = &no_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_programmable_bootstrap_cg<Torus, params, FULLSM>, grid,
thds, (void **)kernel_args, full_sm, stream));
(void *)
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
grid, thds, (void **)kernel_args, full_sm, stream));
}
check_cuda_error(cudaGetLastError());
}
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
int glwe_dimension, int level_count, int num_samples,
uint32_t max_shared_memory) {
@@ -338,30 +342,34 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
if (max_shared_memory < partial_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_cg<Torus, params, NOSM>, thds, 0);
(void *)
device_programmable_bootstrap_cg<InputTorus, Torus, params, NOSM>,
thds, 0);
} else if (max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
(void *)device_programmable_bootstrap_cg<InputTorus, Torus, params,
PARTIALSM>,
thds, partial_sm);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_cg<Torus, params, FULLSM>,
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
cudaFuncCachePreferShared);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_cg<Torus, params, FULLSM>, thds,
full_sm);
(void *)
device_programmable_bootstrap_cg<InputTorus, Torus, params, FULLSM>,
thds, full_sm);
check_cuda_error(cudaGetLastError());
}
@@ -372,39 +380,39 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
}
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus>
template <typename InputTorus, typename Torus>
__host__ bool supports_cooperative_groups_on_programmable_bootstrap(
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 512:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 1024:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 2048:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 4096:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<4096>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 8192:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<8192>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 16384:
return verify_cuda_programmable_bootstrap_cg_grid_size<
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<16384>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. "
"Supported N's are powers of two"

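The switch above converts a runtime polynomial size into a compile-time AmortizedDegree<N> template argument. A condensed sketch of the same runtime-to-compile-time dispatch, with a hypothetical leaf standing in for verify_cuda_programmable_bootstrap_cg_grid_size and only a subset of the cases:

#include <cstdint>

template <uint32_t N> struct AmortizedDegreeSketch {
  static constexpr uint32_t degree = N;
};

template <typename InputTorus, typename Torus, class params>
bool grid_size_ok_sketch() {
  return params::degree >= 256; // placeholder check
}

template <typename InputTorus, typename Torus>
bool supports_cg_sketch(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 256:
    return grid_size_ok_sketch<InputTorus, Torus, AmortizedDegreeSketch<256>>();
  case 1024:
    return grid_size_ok_sketch<InputTorus, Torus,
                               AmortizedDegreeSketch<1024>>();
  case 16384:
    return grid_size_ok_sketch<InputTorus, Torus,
                               AmortizedDegreeSketch<16384>>();
  default:
    return false; // the real code PANICs on unsupported sizes
  }
}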
View File

@@ -18,13 +18,14 @@
#include "types/complex/operations.cuh"
#include <vector>
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_cg_accumulate(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ keybundle_array, double2 *join_buffer,
Torus *global_accumulator, uint32_t lwe_dimension,
@@ -60,7 +61,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -80,7 +81,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
if (lwe_offset == 0) {
// Put "b" in [0, 2N[
Torus b_hat = 0;
InputTorus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
@@ -208,7 +209,7 @@ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename params>
template <typename InputTorus, typename Torus, typename params>
__host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
@@ -230,57 +231,59 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
if (max_shared_memory < full_sm_keybundle) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
if (max_shared_memory < partial_sm_cg_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
NOSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
NOSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory < full_sm_cg_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<
InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<
InputTorus, Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
auto lwe_chunk_size = get_lwe_chunk_size<InputTorus, Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
level_count, full_sm_keybundle);
uint64_t size_tracker = 0;
@@ -291,10 +294,10 @@ __host__ uint64_t scratch_cg_multi_bit_programmable_bootstrap(
return size_tracker;
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void execute_cg_external_product_loop(
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus *lwe_array_out,
Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
@@ -359,28 +362,28 @@ __host__ void execute_cg_external_product_loop(
kernel_args[19] = &full_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, NOSM>,
InputTorus, Torus, params, NOSM>,
grid_accumulate, thds, (void **)kernel_args, 0, stream));
} else if (max_shared_memory < full_dm) {
kernel_args[19] = &partial_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, PARTIALSM>,
InputTorus, Torus, params, PARTIALSM>,
grid_accumulate, thds, (void **)kernel_args, partial_sm, stream));
} else {
kernel_args[19] = &no_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
grid_accumulate, thds, (void **)kernel_args, full_sm, stream));
}
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void host_cg_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, uint64_t const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -393,13 +396,13 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
lwe_offset += lwe_chunk_size) {
// Compute a keybundle
execute_compute_keybundle<Torus, params>(
execute_compute_keybundle<InputTorus, Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
// Accumulate
execute_cg_external_product_loop<Torus, params>(
execute_cg_external_product_loop<InputTorus, Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
@@ -409,7 +412,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
}
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
int glwe_dimension, int level_count, int num_samples,
uint32_t max_shared_memory) {
@@ -436,36 +439,36 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, NOSM>,
InputTorus, Torus, params, NOSM>,
thds, 0);
} else if (max_shared_memory < full_sm_cg_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<
InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<
InputTorus, Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, PARTIALSM>,
InputTorus, Torus, params, PARTIALSM>,
thds, partial_sm_cg_accumulate);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_cg_accumulate<InputTorus, Torus,
params, FULLSM>,
cudaFuncCachePreferShared);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_multi_bit_programmable_bootstrap_cg_accumulate<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
thds, full_sm_cg_accumulate);
check_cuda_error(cudaGetLastError());
}
@@ -478,39 +481,39 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
// Verify if the grid size for the multi-bit kernel satisfies the cooperative
// group constraints
template <typename Torus>
template <typename InputTorus, typename Torus>
__host__ bool supports_cooperative_groups_on_multibit_programmable_bootstrap(
int glwe_dimension, int polynomial_size, int level_count, int num_samples,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 512:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 1024:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 2048:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 4096:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<4096>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 8192:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<8192>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
case 16384:
return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<16384>>(
glwe_dimension, level_count, num_samples, max_shared_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"

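The scratch path above configures each kernel for one of three shared-memory tiers (NOSM, PARTIALSM, FULLSM) depending on how much dynamic shared memory the device offers, then requests the matching amount at launch. A hedged sketch of that tiering with a stand-in kernel (the real calls target the keybundle and accumulate kernels):

#include <cstdint>
#include <cuda_runtime.h>

__global__ void kernel_sketch(int *out) { *out = 0; }

enum SmTierSketch { NOSM_T, PARTIALSM_T, FULLSM_T };

SmTierSketch configure_sm_sketch(int max_shared_memory, int partial_sm,
                                 int full_sm) {
  if (max_shared_memory < partial_sm)
    return NOSM_T; // everything stays in global memory
  if (max_shared_memory < full_sm) {
    cudaFuncSetAttribute(kernel_sketch,
                         cudaFuncAttributeMaxDynamicSharedMemorySize,
                         partial_sm);
    cudaFuncSetCacheConfig(kernel_sketch, cudaFuncCachePreferShared);
    return PARTIALSM_T;
  }
  cudaFuncSetAttribute(kernel_sketch,
                       cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm);
  cudaFuncSetCacheConfig(kernel_sketch, cudaFuncCachePreferShared);
  return FULLSM_T;
}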
View File

@@ -7,18 +7,19 @@
#include <stdio.h>
template <typename Torus>
template <typename InputTorus, typename Torus>
bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count,
uint32_t num_samples,
uint32_t max_shared_memory) {
return supports_cooperative_groups_on_programmable_bootstrap<Torus>(
return supports_cooperative_groups_on_programmable_bootstrap<InputTorus,
Torus>(
glwe_dimension, polynomial_size, level_count, num_samples,
max_shared_memory);
}
template <typename Torus>
template <typename InputTorus, typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
@@ -28,38 +29,39 @@ bool has_support_to_cuda_programmable_bootstrap_tbc(
switch (polynomial_size) {
case 256:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 512:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 1024:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 2048:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, Degree<2048>>(num_samples, glwe_dimension, polynomial_size,
level_count, max_shared_memory);
InputTorus, Torus, Degree<2048>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 4096:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 8192:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 16384:
return supports_thread_block_clusters_on_classic_programmable_bootstrap<
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
default:
PANIC("Cuda error (classical PBS): unsupported polynomial size. Supported "
"N's are powers of two"
@@ -71,7 +73,7 @@ bool has_support_to_cuda_programmable_bootstrap_tbc(
}
#if (CUDA_ARCH >= 900)
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_programmable_bootstrap_tbc(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -80,37 +82,43 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc(
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<256>>(
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 512:
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<512>>(
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 1024:
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<1024>>(
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 2048:
return scratch_programmable_bootstrap_tbc<Torus, Degree<2048>>(
return scratch_programmable_bootstrap_tbc<InputTorus, Torus, Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 4096:
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 8192:
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 16384:
return scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
return scratch_programmable_bootstrap_tbc<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
@@ -121,11 +129,11 @@ uint64_t scratch_cuda_programmable_bootstrap_tbc(
}
}
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -134,7 +142,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<256>>(
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -142,7 +150,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 512:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<512>>(
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -150,7 +158,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 1024:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<1024>>(
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -158,7 +166,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 2048:
host_programmable_bootstrap_tbc<Torus, Degree<2048>>(
host_programmable_bootstrap_tbc<InputTorus, Torus, Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -166,7 +174,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 4096:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -174,7 +182,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 8192:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -182,7 +190,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 16384:
host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
host_programmable_bootstrap_tbc<InputTorus, Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -197,7 +205,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
}
#endif
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_programmable_bootstrap_cg(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -206,37 +214,44 @@ uint64_t scratch_cuda_programmable_bootstrap_cg(
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<256>>(
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 512:
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<512>>(
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 1024:
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<1024>>(
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 2048:
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 4096:
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 8192:
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 16384:
return scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
return scratch_programmable_bootstrap_cg<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
@@ -247,7 +262,7 @@ uint64_t scratch_cuda_programmable_bootstrap_cg(
}
}
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -256,37 +271,44 @@ uint64_t scratch_cuda_programmable_bootstrap(
switch (polynomial_size) {
case 256:
return scratch_programmable_bootstrap<Torus, AmortizedDegree<256>>(
return scratch_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 512:
return scratch_programmable_bootstrap<Torus, AmortizedDegree<512>>(
return scratch_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 1024:
return scratch_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
return scratch_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 2048:
return scratch_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
return scratch_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 4096:
return scratch_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
return scratch_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 8192:
return scratch_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
return scratch_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
case 16384:
return scratch_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
return scratch_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
@@ -299,11 +321,10 @@ uint64_t scratch_cuda_programmable_bootstrap(
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the classical PBS on 32 bits inputs, into `buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
* the classical PBS on 32-bit inputs with 64-bit outputs, into `buffer`. It
* also configures SM options on the GPU in case FULLSM or PARTIALSM mode is
* going to be used.
*/
uint64_t scratch_cuda_programmable_bootstrap_32(
uint64_t scratch_cuda_programmable_bootstrap_32_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
@@ -311,25 +332,25 @@ uint64_t scratch_cuda_programmable_bootstrap_32(
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t, uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, max_shared_memory))
return scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
return scratch_cuda_programmable_bootstrap_tbc<uint32_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
else
#endif
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t, uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
return scratch_cuda_programmable_bootstrap_cg<uint32_t>(
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
return scratch_cuda_programmable_bootstrap_cg<uint32_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
else
return scratch_cuda_programmable_bootstrap<uint32_t>(
stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
return scratch_cuda_programmable_bootstrap<uint32_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
}
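The body above fixes the selection order the scratch functions use: thread block clusters on SM 9.0+ when supported, else cooperative groups, else the default kernel. A condensed sketch of that fallback with the support predicates stubbed (the real ones, has_support_to_cuda_programmable_bootstrap_tbc / _cg, query shared memory and grid occupancy):

#include <cstdint>

enum class PbsVariantSketch { TBC, CG, DEFAULT };

bool tbc_supported_sketch() {
#if (CUDA_ARCH >= 900)
  return true; // plus runtime occupancy checks, elided here
#else
  return false;
#endif
}
bool cg_supported_sketch() { return true; } // stub

PbsVariantSketch select_variant_sketch() {
  if (tbc_supported_sketch())
    return PbsVariantSketch::TBC;
  if (cg_supported_sketch())
    return PbsVariantSketch::CG;
  return PbsVariantSketch::DEFAULT;
}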
@@ -347,34 +368,34 @@ uint64_t scratch_cuda_programmable_bootstrap_64(
auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
#if (CUDA_ARCH >= 900)
if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t, uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, max_shared_memory))
return scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
return scratch_cuda_programmable_bootstrap_tbc<uint64_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
else
#endif
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t, uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory))
return scratch_cuda_programmable_bootstrap_cg<uint64_t>(
return scratch_cuda_programmable_bootstrap_cg<uint64_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
else
return scratch_cuda_programmable_bootstrap<uint64_t>(
return scratch_cuda_programmable_bootstrap<uint64_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory, noise_reduction_type);
}
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -383,7 +404,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
switch (polynomial_size) {
case 256:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<256>>(
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -391,7 +412,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 512:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<512>>(
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -399,7 +420,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 1024:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<1024>>(
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -407,7 +428,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 2048:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -415,7 +436,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 4096:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -423,7 +444,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 8192:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -431,7 +452,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 16384:
host_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
host_programmable_bootstrap_cg<InputTorus, Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -445,11 +466,11 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
}
}
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -458,7 +479,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
switch (polynomial_size) {
case 256:
host_programmable_bootstrap<Torus, AmortizedDegree<256>>(
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -466,7 +487,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 512:
host_programmable_bootstrap<Torus, AmortizedDegree<512>>(
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -474,7 +495,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 1024:
host_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -482,7 +503,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 2048:
host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -490,7 +511,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 4096:
host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -498,7 +519,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 8192:
host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -506,7 +527,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
num_many_lut, lut_stride);
break;
case 16384:
host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
host_programmable_bootstrap<InputTorus, Torus, AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
@@ -522,7 +543,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
/* Perform bootstrapping on a batch of u32 input LWE ciphertexts, producing
 * u64 output LWE ciphertexts.
 */
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
void cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
@@ -530,23 +551,24 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_many_lut, uint32_t lut_stride) {
if (base_log > 32)
PANIC("Cuda error (classical PBS): base log should be <= 32")
if (base_log > 64)
PANIC("Cuda error (classical PBS): base log should be <= 64")
pbs_buffer<uint32_t, CLASSICAL> *buffer =
(pbs_buffer<uint32_t, CLASSICAL> *)mem_ptr;
pbs_buffer<uint64_t, CLASSICAL> *buffer =
(pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
check_cuda_error(cudaGetLastError());
switch (buffer->pbs_variant) {
case TBC:
#if CUDA_ARCH >= 900
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<const uint32_t *>(lwe_output_indexes),
static_cast<const uint32_t *>(lut_vector),
static_cast<const uint32_t *>(lut_vector_indexes),
case PBS_VARIANT::TBC:
#if (CUDA_ARCH >= 900)
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t, uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint32_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
@@ -554,26 +576,26 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
#else
PANIC("Cuda error (PBS): TBC pbs is not supported.")
#endif
case CG:
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<const uint32_t *>(lwe_output_indexes),
static_cast<const uint32_t *>(lut_vector),
static_cast<const uint32_t *>(lut_vector_indexes),
case PBS_VARIANT::CG:
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t, uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint32_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
break;
case DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<const uint32_t *>(lwe_output_indexes),
static_cast<const uint32_t *>(lut_vector),
static_cast<const uint32_t *>(lut_vector_indexes),
case PBS_VARIANT::DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t, uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint32_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const double2 *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count, num_samples,
num_many_lut, lut_stride);
@@ -644,7 +666,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
* - the constant memory (64K) is used for storing the roots of identity
* values for the FFT
*/
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
void cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
@@ -663,7 +685,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
switch (buffer->pbs_variant) {
case PBS_VARIANT::TBC:
#if (CUDA_ARCH >= 900)
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t, uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
@@ -678,7 +700,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
PANIC("Cuda error (PBS): TBC pbs is not supported.")
#endif
case PBS_VARIANT::CG:
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t, uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
@@ -690,7 +712,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
num_many_lut, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t, uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
@@ -705,7 +727,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
PANIC("Cuda error (PBS): unknown pbs variant.")
}
}
/*
* This cleanup function frees the data on GPU for the PBS buffer for 32 or 64
* bits inputs.
@@ -718,11 +739,16 @@ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
*buffer = nullptr;
}
template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
template bool has_support_to_cuda_programmable_bootstrap_cg<uint32_t, uint64_t>(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory);
template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t, uint64_t>(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory);
template void
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
@@ -732,7 +758,8 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
template void
cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
@@ -742,14 +769,14 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template uint64_t scratch_cuda_programmable_bootstrap_cg<uint64_t>(
template uint64_t scratch_cuda_programmable_bootstrap_cg<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
template uint64_t scratch_cuda_programmable_bootstrap<uint64_t>(
template uint64_t scratch_cuda_programmable_bootstrap<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
@@ -765,38 +792,31 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
uint32_t const *lwe_output_indexes, uint32_t const *lut_vector,
uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
uint32_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template uint64_t scratch_cuda_programmable_bootstrap_cg<uint32_t>(
template uint64_t scratch_cuda_programmable_bootstrap_cg<uint32_t, uint64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
template uint64_t scratch_cuda_programmable_bootstrap<uint32_t>(
template uint64_t scratch_cuda_programmable_bootstrap<uint32_t, uint32_t>(
void *stream, uint32_t gpu_index, pbs_buffer<uint32_t, CLASSICAL> **buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
template bool
has_support_to_cuda_programmable_bootstrap_tbc<uint32_t, uint64_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
template bool
has_support_to_cuda_programmable_bootstrap_tbc<uint64_t, uint64_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if CUDA_ARCH >= 900
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
template void
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t, uint32_t>(
void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
uint32_t const *lwe_output_indexes, uint32_t const *lut_vector,
uint32_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
@@ -805,7 +825,8 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
template void
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
@@ -814,18 +835,35 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
template void
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t, uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
uint64_t const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride);
template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint32_t, uint32_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
template uint64_t scratch_cuda_programmable_bootstrap_tbc<uint32_t, uint64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);
template bool
supports_distributed_shared_memory_on_classic_programmable_bootstrap<
__uint128_t>(uint32_t polynomial_size, uint32_t max_shared_memory);
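
The explicit instantiations above are what let the untemplated C entry points link against the templated kernels: every supported (InputTorus, Torus) pair must be instantiated in this translation unit. A minimal sketch of the pattern, with a hypothetical pbs_entry standing in for the real PBS functions:

#include <cstddef>
#include <cstdint>

// Hypothetical stand-in for the templated entry points in this file.
template <typename InputTorus, typename Torus>
void pbs_entry(const InputTorus *in, Torus *out, size_t n) {
  // Widening an input torus element into the output torus scales it:
  // embedding Z/2^32 into Z/2^64 multiplies by 2^32 (a shift of 0 when
  // the widths already match).
  constexpr unsigned shift = 8 * (sizeof(Torus) - sizeof(InputTorus));
  for (size_t i = 0; i < n; ++i)
    out[i] = static_cast<Torus>(in[i]) << shift;
}

// One explicit instantiation per supported width pair, mirroring the
// <uint32_t, uint64_t> and <uint64_t, uint64_t> lists above.
template void pbs_entry<uint32_t, uint64_t>(const uint32_t *, uint64_t *, size_t);
template void pbs_entry<uint64_t, uint64_t>(const uint64_t *, uint64_t *, size_t);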

View File

@@ -17,12 +17,13 @@
#include "polynomial/polynomial_math.cuh"
#include "types/complex/operations.cuh"
template <typename Torus, class params, sharedMemDegree SMD, bool first_iter>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD, bool first_iter>
__global__ void __launch_bounds__(params::degree / params::opt)
device_programmable_bootstrap_step_one(
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes, Torus *global_accumulator,
double2 *global_join_buffer, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -55,7 +56,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -75,8 +76,8 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// First iteration
// Put "b" in [0, 2N[
constexpr auto log_modulus = params::log2_degree + 1;
Torus b_hat = 0;
Torus correction = 0;
InputTorus b_hat = 0;
InputTorus correction = 0;
if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) {
correction = centered_binary_modulus_switch_body_correction_to_add(
block_lwe_array_in, lwe_dimension, log_modulus);
@@ -86,6 +87,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// The y-dimension is used to select the element of the GLWE this block will
// compute
// b_hat is cast from InputTorus (i.e. 64-bit or 32-bit) to uint32_t here
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
@@ -100,7 +102,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
}
// Put "a" in [0, 2N[
Torus a_hat = 0;
InputTorus a_hat = 0;
modulus_switch(block_lwe_array_in[lwe_iteration], a_hat,
params::log2_degree + 1); // 2 * params::log2_degree + 1);
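
Both b_hat and a_hat now live in the input torus width, so the modulus switch works directly on 32-bit or 64-bit coefficients. A minimal sketch of a round-to-nearest modulus switch, assuming the in-tree modulus_switch follows the usual TFHE formula (modulus_switch_sketch is a hypothetical name, not the device function used above):

#include <cstdint>

// Maps x in Z/2^w (w = 32 or 64, depending on InputTorus) onto
// [0, 2N) by rounding to its top log_modulus = log2(2N) bits.
template <typename InputTorus>
__host__ __device__ uint32_t modulus_switch_sketch(InputTorus x,
                                                   uint32_t log_modulus) {
  constexpr uint32_t w = 8 * sizeof(InputTorus);
  // Add half of the discarded interval to round to nearest ...
  x += InputTorus(1) << (w - log_modulus - 1);
  // ... then keep only the top log_modulus bits.
  return static_cast<uint32_t>(x >> (w - log_modulus));
}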
@@ -307,7 +309,7 @@ uint64_t get_buffer_size_programmable_bootstrap(
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename params>
template <typename InputTorus, typename Torus, typename params>
__host__ uint64_t scratch_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t lwe_dimension,
@@ -331,30 +333,38 @@ __host__ uint64_t scratch_programmable_bootstrap(
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, true>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
PARTIALSM, true>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, true>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
PARTIALSM, true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, false>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
PARTIALSM, false>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, false>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
PARTIALSM, false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one<Torus, params, FULLSM, true>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
FULLSM, true>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one<Torus, params, FULLSM, true>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
FULLSM, true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_step_one<Torus, params, FULLSM, false>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
FULLSM, false>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_step_one<Torus, params, FULLSM, false>,
device_programmable_bootstrap_step_one<InputTorus, Torus, params,
FULLSM, false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
@@ -398,10 +408,10 @@ __host__ uint64_t scratch_programmable_bootstrap(
return size_tracker;
}
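
scratch_programmable_bootstrap raises each kernel's dynamic shared-memory cap up front, and execute_step_one below picks the NOSM, PARTIALSM, or FULLSM variant at launch time. A minimal sketch of this opt-in-then-launch pattern, assuming only the standard CUDA runtime API (my_kernel, the grid shape, and full_sm are hypothetical):

#include <cstdint>
#include <cuda_runtime.h>

__global__ void my_kernel(int *out) {
  extern __shared__ int8_t smem[];
  smem[threadIdx.x] = (int8_t)threadIdx.x;
  __syncthreads();
  out[threadIdx.x] = smem[threadIdx.x];
}

void launch(int *d_out, int full_sm, cudaStream_t stream, int gpu_index) {
  int budget = 0;
  cudaDeviceGetAttribute(&budget, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         gpu_index);
  // Launches above the default 48KB cap need an explicit opt-in first.
  if (budget >= full_sm) {
    cudaFuncSetAttribute(my_kernel,
                         cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm);
    my_kernel<<<1, 256, full_sm, stream>>>(d_out);
  } else {
    // Fall back to a small dynamic allocation that fits everywhere
    // (the real code switches to a NOSM kernel variant instead).
    my_kernel<<<1, 256, 256 * sizeof(int8_t), stream>>>(d_out);
  }
}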
template <typename Torus, class params, bool first_iter>
template <typename InputTorus, typename Torus, class params, bool first_iter>
__host__ void execute_step_one(
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
Torus *global_accumulator, double2 *global_join_buffer,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
@@ -416,21 +426,24 @@ __host__ void execute_step_one(
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1, level_count);
if (max_shared_memory < partial_sm) {
device_programmable_bootstrap_step_one<Torus, params, NOSM, first_iter>
device_programmable_bootstrap_step_one<InputTorus, Torus, params, NOSM,
first_iter>
<<<grid, thds, 0, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
global_accumulator, global_join_buffer, lwe_iteration,
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
full_dm, noise_reduction_type);
} else if (max_shared_memory < full_sm) {
device_programmable_bootstrap_step_one<Torus, params, PARTIALSM, first_iter>
device_programmable_bootstrap_step_one<InputTorus, Torus, params, PARTIALSM,
first_iter>
<<<grid, thds, partial_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
global_accumulator, global_join_buffer, lwe_iteration,
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
partial_dm, noise_reduction_type);
} else {
device_programmable_bootstrap_step_one<Torus, params, FULLSM, first_iter>
device_programmable_bootstrap_step_one<InputTorus, Torus, params, FULLSM,
first_iter>
<<<grid, thds, full_sm, stream>>>(
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes,
global_accumulator, global_join_buffer, lwe_iteration,
@@ -484,11 +497,11 @@ __host__ void execute_step_two(
/*
* Host wrapper to the programmable bootstrap
*/
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void host_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -520,7 +533,7 @@ __host__ void host_programmable_bootstrap(
for (int i = 0; i < lwe_dimension; i++) {
if (i == 0) {
execute_step_one<Torus, params, true>(
execute_step_one<InputTorus, Torus, params, true>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,
@@ -528,7 +541,7 @@ __host__ void host_programmable_bootstrap(
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one,
noise_reduction_type);
} else {
execute_step_one<Torus, params, false>(
execute_step_one<InputTorus, Torus, params, false>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, global_accumulator,
global_join_buffer, input_lwe_ciphertext_count, lwe_dimension,

View File

@@ -7,15 +7,16 @@
#include "programmable_bootstrap_tbc_multibit.cuh"
#endif
// Used in C++ benchmarks, not used in Rust code
bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t num_samples, uint32_t max_shared_memory) {
return supports_cooperative_groups_on_multibit_programmable_bootstrap<
uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples,
max_shared_memory);
uint64_t, uint64_t>(glwe_dimension, polynomial_size, level_count,
num_samples, max_shared_memory);
}
template <typename Torus>
template <typename InputTorus, typename Torus>
bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
@@ -26,39 +27,39 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
switch (polynomial_size) {
case 256:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 512:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 1024:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 2048:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 4096:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 8192:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
case 16384:
return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
InputTorus, Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
polynomial_size, level_count,
max_shared_memory);
default:
PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
"N's are powers of two"
@@ -69,11 +70,11 @@ bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
#endif
}
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -82,7 +83,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
switch (polynomial_size) {
case 256:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -90,7 +92,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -98,7 +101,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -106,7 +110,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 2048:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -114,7 +119,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 4096:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -122,7 +128,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 8192:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -130,7 +137,8 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 16384:
host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
host_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -144,11 +152,11 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
}
}
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -157,7 +165,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
switch (polynomial_size) {
case 256:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -165,7 +174,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -173,7 +183,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -181,7 +192,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 2048:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -189,7 +201,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 4096:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -197,7 +210,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 8192:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -205,7 +219,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 16384:
host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
host_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -239,7 +254,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
switch (buffer->pbs_variant) {
case PBS_VARIANT::TBC:
#if CUDA_ARCH >= 900
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
@@ -254,7 +270,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
#endif
case PBS_VARIANT::CG:
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
@@ -266,7 +283,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
num_samples, num_many_lut, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
@@ -282,7 +300,73 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
}
}
template <typename Torus>
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
void *stream, uint32_t gpu_index, void *lwe_array_out,
void const *lwe_output_indexes, void const *lut_vector,
void const *lut_vector_indexes, void const *lwe_array_in,
void const *lwe_input_indexes, void const *bootstrapping_key,
int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_many_lut,
uint32_t lut_stride) {
PANIC_IF_FALSE(base_log <= 64,
"Cuda error (multi-bit PBS): base log (%d) should be <= 64",
base_log);
pbs_buffer<uint64_t, MULTI_BIT> *buffer =
(pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
switch (buffer->pbs_variant) {
case PBS_VARIANT::TBC:
#if CUDA_ARCH >= 900
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint32_t,
uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
#else
PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
#endif
case PBS_VARIANT::CG:
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint32_t,
uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
case PBS_VARIANT::DEFAULT:
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint32_t,
uint64_t>(
stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_output_indexes),
static_cast<const uint64_t *>(lut_vector),
static_cast<const uint64_t *>(lut_vector_indexes),
static_cast<const uint32_t *>(lwe_array_in),
static_cast<const uint64_t *>(lwe_input_indexes),
static_cast<const uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
num_samples, num_many_lut, lut_stride);
break;
default:
PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
}
}
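
For reference, a hedged end-to-end sketch of driving this new 32-to-64 multi-bit path from the host; the d_* device pointers and sizing parameters are placeholders the caller must allocate and fill, and the cleanup signature is assumed by analogy with cleanup_cuda_programmable_bootstrap:

// All d_* pointers are assumed to be valid device allocations filled by
// the caller; parameter values are illustrative only.
int8_t *pbs_mem = nullptr;
scratch_cuda_multi_bit_programmable_bootstrap_32_64(
    stream, gpu_index, &pbs_mem, glwe_dimension, polynomial_size,
    level_count, num_samples, /*allocate_gpu_memory=*/true);

cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
    stream, gpu_index, d_lwe_out_u64, d_out_indexes_u64, d_lut_u64,
    d_lut_indexes_u64, d_lwe_in_u32, d_in_indexes_u64, d_bsk_u64, pbs_mem,
    lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
    base_log, level_count, num_samples, num_many_lut, lut_stride);

// Assumed cleanup entry point, mirroring the classic-PBS counterpart.
cleanup_cuda_multi_bit_programmable_bootstrap(stream, gpu_index, &pbs_mem);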
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -290,43 +374,43 @@ uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap(
switch (polynomial_size) {
case 256:
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 8192:
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 16384:
return scratch_cg_multi_bit_programmable_bootstrap<Torus,
return scratch_cg_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
@@ -338,7 +422,7 @@ uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap(
}
}
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -346,43 +430,43 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap(
switch (polynomial_size) {
case 256:
return scratch_multi_bit_programmable_bootstrap<Torus,
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_multi_bit_programmable_bootstrap<Torus,
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_multi_bit_programmable_bootstrap<Torus,
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_multi_bit_programmable_bootstrap<Torus,
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
return scratch_multi_bit_programmable_bootstrap<Torus,
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 8192:
return scratch_multi_bit_programmable_bootstrap<Torus,
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 16384:
return scratch_multi_bit_programmable_bootstrap<Torus,
return scratch_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
@@ -400,7 +484,8 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
bool supports_cg =
supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t>(
supports_cooperative_groups_on_multibit_programmable_bootstrap<uint64_t,
uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
#if (CUDA_ARCH >= 900)
@@ -412,24 +497,70 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_64(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
bool supports_tbc =
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t,
uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, cuda_get_max_shared_memory(gpu_index));
if (supports_tbc)
return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t,
uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
else
#endif
if (supports_cg)
return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
else
return scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
return scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
}
uint64_t scratch_cuda_multi_bit_programmable_bootstrap_32_64(
void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
bool supports_cg =
supports_cooperative_groups_on_multibit_programmable_bootstrap<uint32_t,
uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
#if (CUDA_ARCH >= 900)
// On H100s we should be using TBC while num_samples < num_sms / 2.
// After that we switch to CG until it is no longer supported,
// at which point we return to TBC.
int num_sms = 0;
check_cuda_error(cudaDeviceGetAttribute(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
bool supports_tbc =
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint32_t,
uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
level_count, cuda_get_max_shared_memory(gpu_index));
if (supports_tbc)
return scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint32_t,
uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
else
#endif
if (supports_cg)
return scratch_cuda_cg_multi_bit_programmable_bootstrap<uint32_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
else
return scratch_cuda_multi_bit_programmable_bootstrap<uint32_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, allocate_gpu_memory);
@@ -455,7 +586,7 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
* The value 13 was empirically determined based on memory requirements for
* benchmarking on an RTX 4090 GPU, balancing performance and resource use.
*/
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t level_count, uint64_t full_sm_keybundle) {
@@ -466,13 +597,14 @@ uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
if (max_shared_memory < full_sm_keybundle)
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, NOSM>,
polynomial_size / params::opt, full_sm_keybundle);
else
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_blocks_per_sm,
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>,
polynomial_size / params::opt, 0);
int num_sms = 0;
@@ -535,14 +667,15 @@ uint64_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
return (max_num_chunks > divisor) ? divisor : max_num_chunks;
}
template uint64_t scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
template uint64_t
scratch_cuda_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template void
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
@@ -552,14 +685,16 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride);
template uint64_t scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
template uint64_t
scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index,
pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template void
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
@@ -570,12 +705,12 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint32_t num_many_lut, uint32_t lut_stride);
template bool
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint32_t, uint64_t>(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory);
#if (CUDA_ARCH >= 900)
template <typename Torus>
template <typename InputTorus, typename Torus>
uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap(
void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
@@ -583,43 +718,43 @@ uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap(
switch (polynomial_size) {
case 256:
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 512:
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 1024:
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 2048:
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 4096:
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 8192:
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
allocate_gpu_memory);
case 16384:
return scratch_tbc_multi_bit_programmable_bootstrap<Torus,
return scratch_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
@@ -630,11 +765,11 @@ uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap(
" in the interval [256..16384].")
}
}
template <typename Torus>
template <typename InputTorus, typename Torus>
void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
void *stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -646,7 +781,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
switch (polynomial_size) {
case 256:
host_tbc_multi_bit_programmable_bootstrap<uint64_t, AmortizedDegree<256>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -654,7 +790,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 512:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -662,7 +799,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 1024:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -675,14 +813,16 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
&num_sms, cudaDevAttrMultiProcessorCount, gpu_index));
if (4 * num_sms < num_samples * level_count * (glwe_dimension + 1))
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
lwe_dimension, polynomial_size, grouping_factor, base_log,
level_count, num_samples, num_many_lut, lut_stride);
else
host_tbc_multi_bit_programmable_bootstrap<Torus, Degree<2048>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -692,7 +832,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
break;
}
case 4096:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -700,7 +841,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 8192:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<8192>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -708,7 +850,8 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
num_samples, num_many_lut, lut_stride);
break;
case 16384:
host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
host_tbc_multi_bit_programmable_bootstrap<InputTorus, Torus,
AmortizedDegree<16384>>(
static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
@@ -722,13 +865,15 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
}
}
template uint64_t scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
template uint64_t
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
template void
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint64_t const *lwe_array_in,
@@ -737,4 +882,16 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride);
template void
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint32_t,
uint64_t>(
void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
uint64_t const *lwe_output_indexes, uint64_t const *lut_vector,
uint64_t const *lut_vector_indexes, uint32_t const *lwe_array_in,
uint64_t const *lwe_input_indexes, uint64_t const *bootstrapping_key,
pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
uint32_t num_many_lut, uint32_t lut_stride);
#endif
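
The `<uint32_t, uint64_t>` instantiation above is the 64/32 path this commit adds: a 32-bit keyswitched ciphertext drives a 64-bit bootstrap. A minimal host-side sketch of the intended call sequence (illustrative only: it assumes a matching `<uint32_t, uint64_t>` scratch instantiation is provided elsewhere in the commit, that the library headers are included, and that all pointers are device-resident):

// Hypothetical driver: 32-bit LWE inputs in, 64-bit LWE outputs out.
void pbs_32_in_64_out(void *stream, uint32_t gpu_index, uint64_t *lwe_out,
                      const uint64_t *out_idx, const uint64_t *lut,
                      const uint64_t *lut_idx, const uint32_t *lwe_in_32,
                      const uint64_t *in_idx, const uint64_t *bsk,
                      uint32_t lwe_dim, uint32_t glwe_dim, uint32_t N,
                      uint32_t grouping_factor, uint32_t base_log,
                      uint32_t levels, uint32_t num_samples) {
  pbs_buffer<uint64_t, MULTI_BIT> *buf = nullptr;
  scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint32_t, uint64_t>(
      stream, gpu_index, &buf, glwe_dim, N, levels, num_samples,
      /*allocate_gpu_memory=*/true);
  cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint32_t,
                                                                  uint64_t>(
      stream, gpu_index, lwe_out, out_idx, lut, lut_idx, lwe_in_32, in_idx,
      bsk, buf, lwe_dim, glwe_dim, N, grouping_factor, base_log, levels,
      num_samples, /*num_many_lut=*/1, /*lut_stride=*/0);
}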

View File

@@ -25,9 +25,10 @@ get_start_ith_ggsw_offset(uint32_t polynomial_size, int glwe_dimension,
level_count;
}
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void device_multi_bit_programmable_bootstrap_keybundle(
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -55,7 +56,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
double2 *keybundle = keybundle_array +
@@ -86,10 +87,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
// Precalculate the monomial degrees and store them in shared memory
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
if (threadIdx.x < (1 << grouping_factor)) {
const Torus *lwe_array_group =
const InputTorus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
monomial_degrees[threadIdx.x] =
calculates_monomial_degree<InputTorus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
}
__syncthreads();
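
For reference, a CPU-side analogue of what calculates_monomial_degree<InputTorus, params> produces per selector; the subset-sum semantics and the bit order inside the group are assumptions here, not read off this diff:

#include <cstdint>
template <typename InputTorus>
uint32_t monomial_degree_sketch(const InputTorus *group, uint32_t idx,
                                uint32_t grouping_factor,
                                uint32_t log_modulus) {
  InputTorus acc = 0;
  for (uint32_t i = 0; i < grouping_factor; i++)
    if ((idx >> i) & 1)
      acc += group[i]; // wrapping subset sum on the input torus
  constexpr uint32_t BITS = sizeof(InputTorus) * 8;
  acc += (InputTorus)1 << (BITS - log_modulus - 1); // round half-up
  return (uint32_t)(acc >> (BITS - log_modulus));   // degree in [0, 2N)
}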
@@ -145,9 +147,10 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
// Then we can just calculate the offset needed to apply these coefficients,
// and the operation transforms into a pointwise vector multiplication,
// avoiding extra instructions other than MADD
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes, double2 *keybundle_array,
const Torus *__restrict__ bootstrapping_key, uint32_t lwe_dimension,
uint32_t lwe_offset, uint64_t lwe_chunk_size,
@@ -188,7 +191,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)];
double2 *keybundle = keybundle_array +
@@ -219,10 +222,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
uint32_t *monomial_degrees = (uint32_t *)selected_memory;
if (threadIdx.x < (1 << grouping_factor)) {
const Torus *lwe_array_group =
const InputTorus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
monomial_degrees[threadIdx.x] = calculates_monomial_degree<Torus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
monomial_degrees[threadIdx.x] =
calculates_monomial_degree<InputTorus, params>(
lwe_array_group, threadIdx.x, grouping_factor);
}
__syncthreads();
@@ -269,10 +273,11 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle_2_2_params(
}
}
template <typename Torus, class params, sharedMemDegree SMD, bool is_first_iter>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD, bool is_first_iter>
__global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_accumulate_step_one(
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes, Torus *global_accumulator,
@@ -305,7 +310,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double2 *)sharedmem;
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -327,7 +332,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
////////////////////////////////////////////////////////////
// Initializes the accumulator with the body of the LWE ciphertext
// Put "b" in [0, 2N[
Torus b_hat = 0;
InputTorus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
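
b_hat is where the kernel first touches the raw input torus, so widening the template to InputTorus is the whole change. Below is an assumed shape of modulus_switch for a generic torus width (round-half-up into Z_{2N}); with InputTorus = uint32_t and N = 2048 it keeps the top 12 bits of a 32-bit word instead of a 64-bit one:

#include <cstdint>
template <typename InputTorus>
void modulus_switch_sketch(InputTorus x, InputTorus &out,
                           uint32_t log_modulus) {
  constexpr uint32_t BITS = sizeof(InputTorus) * 8;
  x += (InputTorus)1 << (BITS - log_modulus - 1); // rounding bit
  out = x >> (BITS - log_modulus);                // top log_modulus bits
}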
@@ -501,7 +506,7 @@ uint64_t get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two(
return sizeof(Torus) * polynomial_size; // accumulator
}
template <typename Torus, typename params>
template <typename InputTorus, typename Torus, typename params>
__host__ uint64_t scratch_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
@@ -526,20 +531,22 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
if (max_shared_memory < full_sm_keybundle) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
@@ -547,59 +554,59 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
if (max_shared_memory < partial_sm_accumulate_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, NOSM, false>,
InputTorus, Torus, params, NOSM, false>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, NOSM, false>,
InputTorus, Torus, params, NOSM, false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, NOSM, true>,
InputTorus, Torus, params, NOSM, true>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, NOSM, true>,
InputTorus, Torus, params, NOSM, true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory < full_sm_accumulate_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, PARTIALSM, false>,
InputTorus, Torus, params, PARTIALSM, false>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
partial_sm_accumulate_step_one));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, PARTIALSM, false>,
InputTorus, Torus, params, PARTIALSM, false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, PARTIALSM, true>,
InputTorus, Torus, params, PARTIALSM, true>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
partial_sm_accumulate_step_one));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, PARTIALSM, true>,
InputTorus, Torus, params, PARTIALSM, true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, FULLSM, false>,
InputTorus, Torus, params, FULLSM, false>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_accumulate_step_one));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, FULLSM, false>,
InputTorus, Torus, params, FULLSM, false>,
cudaFuncCachePreferShared);
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, FULLSM, true>,
InputTorus, Torus, params, FULLSM, true>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_accumulate_step_one));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, FULLSM, true>,
InputTorus, Torus, params, FULLSM, true>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
@@ -644,7 +651,7 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
auto lwe_chunk_size = get_lwe_chunk_size<InputTorus, Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
level_count, full_sm_keybundle);
uint64_t size_tracker = 0;
@@ -655,9 +662,9 @@ __host__ uint64_t scratch_multi_bit_programmable_bootstrap(
return size_tracker;
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void execute_compute_keybundle(
cudaStream_t stream, uint32_t gpu_index, Torus const *lwe_array_in,
cudaStream_t stream, uint32_t gpu_index, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
@@ -686,7 +693,8 @@ __host__ void execute_compute_keybundle(
dim3 thds(polynomial_size / params::opt, 1, 1);
if (max_shared_memory < full_sm_keybundle) {
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus, params,
NOSM>
<<<grid_keybundle, thds, 0, stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
@@ -694,7 +702,8 @@ __host__ void execute_compute_keybundle(
d_mem, full_sm_keybundle);
} else {
bool supports_tbc =
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<InputTorus,
Torus>(
num_samples, glwe_dimension, polynomial_size, level_count,
cuda_get_max_shared_memory(gpu_index));
@@ -703,20 +712,22 @@ __host__ void execute_compute_keybundle(
dim3 thds_new_keybundle(512, 1, 1);
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
Torus, Degree<2048>, FULLSM>,
InputTorus, Torus, Degree<2048>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 3 * full_sm_keybundle));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
Torus, Degree<2048>, FULLSM>,
InputTorus, Torus, Degree<2048>, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
device_multi_bit_programmable_bootstrap_keybundle_2_2_params<
Torus, Degree<2048>, FULLSM><<<grid_keybundle, thds_new_keybundle,
3 * full_sm_keybundle, stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
lwe_dimension, lwe_offset, chunk_size, keybundle_size_per_input);
InputTorus, Torus, Degree<2048>, FULLSM>
<<<grid_keybundle, thds_new_keybundle, 3 * full_sm_keybundle,
stream>>>(lwe_array_in, lwe_input_indexes, keybundle_fft,
bootstrapping_key, lwe_dimension, lwe_offset, chunk_size,
keybundle_size_per_input);
} else {
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, FULLSM>
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>
<<<grid_keybundle, thds, full_sm_keybundle, stream>>>(
lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key,
lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
@@ -727,10 +738,10 @@ __host__ void execute_compute_keybundle(
check_cuda_error(cudaGetLastError());
}
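
The NOSM / PARTIALSM / FULLSM branching repeated throughout these wrappers is one pattern: pick the kernel variant by how much dynamic shared memory the device can opt into, spilling to a global-memory scratch (d_mem) when there is none. Distilled to a stand-in kernel using the library's sharedMemDegree enum (illustrative; the real wrappers also raise cudaFuncAttributeMaxDynamicSharedMemorySize before launching, as shown above):

template <sharedMemDegree SMD>
__global__ void stand_in_kernel(int8_t *d_mem) {
  extern __shared__ int8_t smem[];
  int8_t *work = (SMD == NOSM) ? d_mem : smem; // no shared memory -> global
  if (threadIdx.x == 0)
    work[0] = 0;
}

inline void launch_by_shared_memory(uint32_t gpu_index, uint64_t partial_sm,
                                    uint64_t full_sm, int8_t *d_mem,
                                    cudaStream_t stream) {
  int max_sm = 0;
  cudaDeviceGetAttribute(&max_sm, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         gpu_index);
  if ((uint64_t)max_sm < partial_sm)
    stand_in_kernel<NOSM><<<1, 256, 0, stream>>>(d_mem);
  else if ((uint64_t)max_sm < full_sm)
    stand_in_kernel<PARTIALSM><<<1, 256, partial_sm, stream>>>(d_mem);
  else
    stand_in_kernel<FULLSM><<<1, 256, full_sm, stream>>>(d_mem);
}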
template <typename Torus, class params, bool is_first_iter>
template <typename InputTorus, typename Torus, class params, bool is_first_iter>
__host__ void execute_step_one(
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count) {
@@ -754,7 +765,7 @@ __host__ void execute_step_one(
if (max_shared_memory < partial_sm_accumulate_step_one)
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, NOSM, is_first_iter>
InputTorus, Torus, params, NOSM, is_first_iter>
<<<grid_accumulate_step_one, thds, 0, stream>>>(
lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes,
global_accumulator, global_accumulator_fft, lwe_dimension,
@@ -762,7 +773,7 @@ __host__ void execute_step_one(
full_sm_accumulate_step_one);
else if (max_shared_memory < full_sm_accumulate_step_one)
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, PARTIALSM, is_first_iter>
InputTorus, Torus, params, PARTIALSM, is_first_iter>
<<<grid_accumulate_step_one, thds, partial_sm_accumulate_step_one,
stream>>>(lwe_array_in, lwe_input_indexes, lut_vector,
lut_vector_indexes, global_accumulator,
@@ -771,7 +782,7 @@ __host__ void execute_step_one(
partial_sm_accumulate_step_one);
else
device_multi_bit_programmable_bootstrap_accumulate_step_one<
Torus, params, FULLSM, is_first_iter>
InputTorus, Torus, params, FULLSM, is_first_iter>
<<<grid_accumulate_step_one, thds, full_sm_accumulate_step_one,
stream>>>(lwe_array_in, lwe_input_indexes, lut_vector,
lut_vector_indexes, global_accumulator,
@@ -823,11 +834,11 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
check_cuda_error(cudaGetLastError());
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void host_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -840,7 +851,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
lwe_offset += lwe_chunk_size) {
// Compute a keybundle
execute_compute_keybundle<Torus, params>(
execute_compute_keybundle<InputTorus, Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
@@ -853,12 +864,12 @@ __host__ void host_multi_bit_programmable_bootstrap(
bool is_last_iter =
(j + lwe_offset) + 1 == (lwe_dimension / grouping_factor);
if (is_first_iter) {
execute_step_one<Torus, params, true>(
execute_step_one<InputTorus, Torus, params, true>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count);
} else {
execute_step_one<Torus, params, false>(
execute_step_one<InputTorus, Torus, params, false>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension,
glwe_dimension, polynomial_size, base_log, level_count);
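
Reduced to its control flow, the driver above walks the bootstrapping key in chunks and flags the first and last group; the callbacks below stand in for the execute_* wrappers and hide all kernel plumbing (a sketch, not the real interface):

#include <algorithm>
#include <cstdint>
#include <functional>
void chunked_pbs_loop(uint32_t lwe_dimension, uint32_t grouping_factor,
                      uint32_t lwe_chunk_size,
                      const std::function<void(uint32_t)> &compute_keybundle,
                      const std::function<void(bool)> &step_one,
                      const std::function<void(bool)> &step_two) {
  const uint32_t num_groups = lwe_dimension / grouping_factor;
  for (uint32_t lwe_offset = 0; lwe_offset < num_groups;
       lwe_offset += lwe_chunk_size) {
    compute_keybundle(lwe_offset); // FFT of the BSK slice for this chunk
    const uint32_t chunk = std::min(lwe_chunk_size, num_groups - lwe_offset);
    for (uint32_t j = 0; j < chunk; j++) {
      const bool is_first = (j + lwe_offset) == 0;
      const bool is_last = (j + lwe_offset) + 1 == num_groups;
      step_one(is_first); // initialize the accumulator only on the first group
      step_two(is_last);  // produce the output only after the last group
    }
  }
}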

View File

@@ -98,9 +98,9 @@ uint64_t scratch_cuda_multi_bit_programmable_bootstrap_128_vector_64(
bool supports_cg =
supports_cooperative_groups_on_multibit_programmable_bootstrap<
__uint128_t>(glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
uint64_t, __uint128_t>(glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
cuda_get_max_shared_memory(gpu_index));
if (supports_cg)
return scratch_cuda_cg_multi_bit_programmable_bootstrap_128<uint64_t>(

View File

@@ -35,12 +35,13 @@ namespace cg = cooperative_groups;
*
* Each y-block computes one element of the lwe_array_out.
*/
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void device_programmable_bootstrap_tbc(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -83,7 +84,7 @@ __global__ void device_programmable_bootstrap_tbc(
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -99,8 +100,8 @@ __global__ void device_programmable_bootstrap_tbc(
// Put "b" in [0, 2N[
constexpr auto log_modulus = params::log2_degree + 1;
Torus b_hat = 0;
Torus correction = 0;
InputTorus b_hat = 0;
InputTorus correction = 0;
if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) {
correction = centered_binary_modulus_switch_body_correction_to_add(
block_lwe_array_in, lwe_dimension, log_modulus);
@@ -117,7 +118,7 @@ __global__ void device_programmable_bootstrap_tbc(
__syncthreads();
// Put "a" in [0, 2N[
Torus a_hat = 0;
InputTorus a_hat = 0;
modulus_switch(block_lwe_array_in[i], a_hat, log_modulus);
// Perform ACC * (X^{a_hat} - 1)
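
A scalar model of the rotation each block applies here: multiply the accumulator by (X^{a_hat} - 1) in Z[X]/(X^N + 1), with a_hat the modulus-switched mask coefficient. Illustrative only; the kernel does this batched, in registers, per polynomial chunk.

#include <cstdint>
#include <vector>
std::vector<int64_t> mul_by_x_pow_minus_one(const std::vector<int64_t> &acc,
                                            uint32_t a_hat) { // a_hat < 2N
  const size_t N = acc.size();
  std::vector<int64_t> out(N, 0);
  for (size_t i = 0; i < N; i++) {
    const size_t e = i + a_hat;                  // exponent before reduction
    const int64_t sign = ((e / N) % 2) ? -1 : 1; // X^N == -1 (negacyclic)
    out[e % N] += sign * acc[i];                 // X^{a_hat} * acc
    out[i] -= acc[i];                            // ... minus acc
  }
  return out;
}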
@@ -200,12 +201,13 @@ __global__ void device_programmable_bootstrap_tbc(
}
}
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void device_programmable_bootstrap_tbc_2_2_params(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ bootstrapping_key, double2 *join_buffer,
uint32_t lwe_dimension, uint32_t num_many_lut, uint32_t lut_stride,
@@ -247,7 +249,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
}
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -263,8 +265,8 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
// Put "b" in [0, 2N[
constexpr auto log_modulus = params::log2_degree + 1;
Torus b_hat = 0;
Torus correction = 0;
InputTorus b_hat = 0;
InputTorus correction = 0;
if (noise_reduction_type == PBS_MS_REDUCTION_T::CENTERED) {
correction = centered_binary_modulus_switch_body_correction_to_add(
block_lwe_array_in, lwe_dimension, log_modulus);
@@ -276,7 +278,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
Torus temp_a_hat = 0;
InputTorus temp_a_hat = 0;
for (int i = 0; i < lwe_dimension; i++) {
// We calculate the modulus switch of a warp size of elements
@@ -382,7 +384,7 @@ __global__ void device_programmable_bootstrap_tbc_2_2_params(
cluster.sync();
}
template <typename Torus, typename params>
template <typename InputTorus, typename Torus, typename params>
__host__ uint64_t scratch_programmable_bootstrap_tbc(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t lwe_dimension,
@@ -410,27 +412,27 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc(
if (max_shared_memory >= full_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm + minimum_sm_tbc));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
partial_sm + minimum_sm_tbc));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, minimum_sm_tbc));
cudaFuncSetCacheConfig(
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
@@ -446,11 +448,11 @@ __host__ uint64_t scratch_programmable_bootstrap_tbc(
/*
* Host wrapper
*/
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void host_programmable_bootstrap_tbc(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, double2 const *bootstrapping_key,
pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
@@ -506,7 +508,8 @@ __host__ void host_programmable_bootstrap_tbc(
config.dynamicSmemBytes = minimum_sm_tbc;
check_cuda_error(cudaLaunchKernelEx(
&config, device_programmable_bootstrap_tbc<Torus, params, NOSM>,
&config,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, polynomial_size, base_log, level_count, d_mem, full_dm,
@@ -515,7 +518,8 @@ __host__ void host_programmable_bootstrap_tbc(
config.dynamicSmemBytes = partial_sm + minimum_sm_tbc;
check_cuda_error(cudaLaunchKernelEx(
&config, device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
&config,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, polynomial_size, base_log, level_count, d_mem,
@@ -530,18 +534,22 @@ __host__ void host_programmable_bootstrap_tbc(
config.dynamicSmemBytes = full_sm_2_2;
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_2_2));
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributePreferredSharedMemoryCarveout,
cudaSharedmemCarveoutMaxShared));
check_cuda_error(cudaFuncSetCacheConfig(
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
params, FULLSM>,
cudaFuncCachePreferShared));
check_cuda_error(cudaLaunchKernelEx(
&config,
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
params, FULLSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, num_many_lut, lut_stride, noise_reduction_type));
@@ -549,7 +557,8 @@ __host__ void host_programmable_bootstrap_tbc(
config.dynamicSmemBytes = full_sm + minimum_sm_tbc;
check_cuda_error(cudaLaunchKernelEx(
&config, device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
&config,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft,
lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0,
@@ -560,7 +569,7 @@ __host__ void host_programmable_bootstrap_tbc(
}
// Verify if the grid size satisfies the cooperative group constraints
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
int glwe_dimension, int level_count, int num_samples,
uint32_t max_shared_memory) {
@@ -585,18 +594,21 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
if (max_shared_memory < partial_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_tbc<Torus, params, NOSM>, thds,
0);
(void *)
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
thds, 0);
} else if (max_shared_memory < full_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
(void *)device_programmable_bootstrap_tbc<InputTorus, Torus, params,
PARTIALSM>,
thds, partial_sm);
} else {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_programmable_bootstrap_tbc<Torus, params, FULLSM>, thds,
full_sm);
(void *)device_programmable_bootstrap_tbc<InputTorus, Torus, params,
FULLSM>,
thds, full_sm);
}
// Get the number of streaming multiprocessors
@@ -621,7 +633,7 @@ bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
}
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
@@ -661,34 +673,39 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
* disable cudaFuncAttributeNonPortableClusterSizeAllowed */
if (max_shared_memory < partial_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, NOSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size, device_programmable_bootstrap_tbc<Torus, params, NOSM>,
&config));
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>, &config));
device_programmable_bootstrap_tbc<InputTorus, Torus, params, NOSM>,
&config));
} else if (max_shared_memory < full_sm + minimum_sm_tbc) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, PARTIALSM>,
&config));
} else {
if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1) {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_programmable_bootstrap_tbc_2_2_params<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc_2_2_params<InputTorus, Torus,
params, FULLSM>,
&config));
} else {
check_cuda_error(cudaFuncSetAttribute(
device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_programmable_bootstrap_tbc<Torus, params, FULLSM>, &config));
device_programmable_bootstrap_tbc<InputTorus, Torus, params, FULLSM>,
&config));
}
}

View File

@@ -18,13 +18,14 @@
#include "types/complex/operations.cuh"
#include <vector>
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_tbc_accumulate(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ keybundle_array, double2 *join_buffer,
Torus *global_accumulator, uint32_t lwe_dimension,
@@ -67,7 +68,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// The first dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -87,7 +88,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
if (lwe_offset == 0) {
// Put "b" in [0, 2N[
Torus b_hat = 0;
InputTorus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
@@ -197,13 +198,14 @@ __global__ void __launch_bounds__(params::degree / params::opt)
//- Use a register-based FFT that needs only minimal synchronization
//- Register-based Fourier-domain multiplication; transfer FFTs between blocks
//  instead of the accumulator.
template <typename Torus, class params, sharedMemDegree SMD>
template <typename InputTorus, typename Torus, class params,
sharedMemDegree SMD>
__global__ void __launch_bounds__(params::degree / params::opt)
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params(
Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const InputTorus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ keybundle_array, double2 *join_buffer,
Torus *global_accumulator, uint32_t lwe_dimension, uint32_t lwe_offset,
@@ -247,7 +249,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
// The first dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
const Torus *block_lwe_array_in =
const InputTorus *block_lwe_array_in =
&lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
const Torus *block_lut_vector =
@@ -266,7 +268,7 @@ __global__ void __launch_bounds__(params::degree / params::opt)
Torus reg_acc_rotated[params::opt];
if (lwe_offset == 0) {
// Put "b" in [0, 2N[
Torus b_hat = 0;
InputTorus b_hat = 0;
modulus_switch(block_lwe_array_in[lwe_dimension], b_hat,
params::log2_degree + 1);
@@ -400,7 +402,7 @@ uint64_t get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap(
return sizeof(Torus) * polynomial_size * 2; // accumulator
}
template <typename Torus, typename params>
template <typename InputTorus, typename Torus, typename params>
__host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index,
pbs_buffer<uint64_t, MULTI_BIT> **buffer, uint32_t glwe_dimension,
@@ -430,20 +432,22 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
if (max_shared_memory < full_sm_keybundle) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_keybundle<InputTorus, Torus,
params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
@@ -451,58 +455,58 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
if (max_shared_memory <
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, NOSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
minimum_sm_tbc_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, NOSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory <
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else {
if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_tbc_accumulate + 2 * minimum_sm_tbc_accumulate));
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
cudaFuncAttributePreferredSharedMemoryCarveout,
cudaSharedmemCarveoutMaxShared));
check_cuda_error(cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
cudaFuncCachePreferShared));
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, FULLSM>,
cudaFuncCachePreferShared);
}
check_cuda_error(cudaGetLastError());
}
auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
auto lwe_chunk_size = get_lwe_chunk_size<InputTorus, Torus, params>(
gpu_index, input_lwe_ciphertext_count, polynomial_size, glwe_dimension,
level_count, full_sm_keybundle);
uint64_t size_tracker = 0;
@@ -513,10 +517,10 @@ __host__ uint64_t scratch_tbc_multi_bit_programmable_bootstrap(
return size_tracker;
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void execute_tbc_external_product_loop(
cudaStream_t stream, uint32_t gpu_index, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus *lwe_array_out,
Torus const *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
@@ -579,8 +583,8 @@ __host__ void execute_tbc_external_product_loop(
config.dynamicSmemBytes = minimum_dm;
check_cuda_error(cudaLaunchKernelEx(
&config,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, NOSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
@@ -591,8 +595,8 @@ __host__ void execute_tbc_external_product_loop(
config.dynamicSmemBytes = partial_dm + minimum_dm;
check_cuda_error(cudaLaunchKernelEx(
&config,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, PARTIALSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
@@ -607,22 +611,22 @@ __host__ void execute_tbc_external_product_loop(
config.dynamicSmemBytes = full_dm + 2 * minimum_dm;
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_dm + 2 * minimum_dm));
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
cudaFuncAttributePreferredSharedMemoryCarveout,
cudaSharedmemCarveoutMaxShared));
check_cuda_error(cudaFuncSetCacheConfig(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
cudaFuncCachePreferShared));
check_cuda_error(cudaLaunchKernelEx(
&config,
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, lwe_offset, chunk_size,
@@ -630,8 +634,8 @@ __host__ void execute_tbc_external_product_loop(
} else {
check_cuda_error(cudaLaunchKernelEx(
&config,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, FULLSM>,
lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft,
global_accumulator, lwe_dimension, glwe_dimension, polynomial_size,
@@ -642,11 +646,11 @@ __host__ void execute_tbc_external_product_loop(
}
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ void host_tbc_multi_bit_programmable_bootstrap(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus const *lwe_output_indexes, Torus const *lut_vector,
Torus const *lut_vector_indexes, Torus const *lwe_array_in,
Torus const *lut_vector_indexes, InputTorus const *lwe_array_in,
Torus const *lwe_input_indexes, Torus const *bootstrapping_key,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
@@ -659,13 +663,13 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
lwe_offset += lwe_chunk_size) {
// Compute a keybundle
execute_compute_keybundle<Torus, params>(
execute_compute_keybundle<InputTorus, Torus, params>(
stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, level_count, lwe_offset);
// Accumulate
execute_tbc_external_product_loop<Torus, params>(
execute_tbc_external_product_loop<InputTorus, Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
num_samples, lwe_dimension, glwe_dimension, polynomial_size,
@@ -690,7 +694,7 @@ bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
}
}
template <typename Torus, class params>
template <typename InputTorus, typename Torus, class params>
__host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_shared_memory) {
@@ -732,45 +736,45 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
if (max_shared_memory <
partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, NOSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
NOSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, NOSM>,
&config));
} else if (max_shared_memory <
full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, PARTIALSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
PARTIALSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, PARTIALSM>,
&config));
} else {
if (polynomial_size == 2048 && level_count == 1 && glwe_dimension == 1) {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_multi_bit_programmable_bootstrap_tbc_accumulate_2_2_params<
Torus, params, FULLSM>,
InputTorus, Torus, params, FULLSM>,
&config));
} else {
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, FULLSM>,
cudaFuncAttributeNonPortableClusterSizeAllowed, false));
check_cuda_error(cudaOccupancyMaxPotentialClusterSize(
&cluster_size,
device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
FULLSM>,
device_multi_bit_programmable_bootstrap_tbc_accumulate<
InputTorus, Torus, params, FULLSM>,
&config));
}
}

View File

@@ -166,14 +166,15 @@ public:
#if CUDA_ARCH >= 900
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
(benchmark::State &st) {
if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t,
uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
pbs_level, cuda_get_max_shared_memory(0))) {
st.SkipWithError("Configuration not supported for tbc operation");
return;
}
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t, uint64_t>(
stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
true);
@@ -181,7 +182,8 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
uint32_t lut_stride = 0;
for (auto _ : st) {
// Execute PBS
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t,
uint64_t>(
stream, gpu_index, d_lwe_ct_out_array, d_lwe_output_indexes,
d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
@@ -197,7 +199,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
(benchmark::State &st) {
if (!has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
if (!has_support_to_cuda_programmable_bootstrap_cg_multi_bit_64(
glwe_dimension, polynomial_size, pbs_level,
input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index))) {
st.SkipWithError("Configuration not supported for fast operation");
@@ -254,7 +256,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
#if CUDA_ARCH >= 900
BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
(benchmark::State &st) {
if (!has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
if (!has_support_to_cuda_programmable_bootstrap_tbc<uint64_t, uint64_t>(
input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
pbs_level, cuda_get_max_shared_memory(0))) {
st.SkipWithError("Configuration not supported for tbc operation");
@@ -269,7 +271,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
uint32_t lut_stride = 0;
for (auto _ : st) {
// Execute PBS
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t, uint64_t>(
stream, gpu_index, (uint64_t *)d_lwe_ct_out_array,
(uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
(uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,
@@ -301,7 +303,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
uint32_t lut_stride = 0;
for (auto _ : st) {
// Execute PBS
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t, uint64_t>(
stream, gpu_index, (uint64_t *)d_lwe_ct_out_array,
(uint64_t *)d_lwe_output_indexes, (uint64_t *)d_lut_pbs_identity,
(uint64_t *)d_lut_pbs_indexes, (uint64_t *)d_lwe_ct_in_array,

View File

@@ -187,7 +187,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute PBS
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_output_indexes, (void *)d_lut_pbs_identity,
(void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,

View File

@@ -3020,6 +3020,20 @@ unsafe extern "C" {
noise_reduction_type: PBS_MS_REDUCTION_T,
) -> u64;
}
unsafe extern "C" {
pub fn scratch_cuda_programmable_bootstrap_32_64(
stream: *mut ffi::c_void,
gpu_index: u32,
buffer: *mut *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
level_count: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
noise_reduction_type: PBS_MS_REDUCTION_T,
) -> u64;
}
unsafe extern "C" {
pub fn scratch_cuda_programmable_bootstrap_128(
stream: *mut ffi::c_void,
@@ -3035,7 +3049,7 @@ unsafe extern "C" {
) -> u64;
}
unsafe extern "C" {
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
@@ -3057,7 +3071,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
pub fn cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
@@ -3110,6 +3124,7 @@ unsafe extern "C" {
);
}
unsafe extern "C" {
#[link_name = "\u{1}_Z55has_support_to_cuda_programmable_bootstrap_cg_multi_bitjjjjj"]
pub fn has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
glwe_dimension: u32,
polynomial_size: u32,
@@ -3144,6 +3159,18 @@ unsafe extern "C" {
grouping_factor: u32,
);
}
unsafe extern "C" {
pub fn scratch_cuda_multi_bit_programmable_bootstrap_32_64(
stream: *mut ffi::c_void,
gpu_index: u32,
pbs_buffer: *mut *mut i8,
glwe_dimension: u32,
polynomial_size: u32,
level_count: u32,
input_lwe_ciphertext_count: u32,
allocate_gpu_memory: bool,
) -> u64;
}
unsafe extern "C" {
pub fn scratch_cuda_multi_bit_programmable_bootstrap_64(
stream: *mut ffi::c_void,
@@ -3179,6 +3206,29 @@ unsafe extern "C" {
lut_stride: u32,
);
}
unsafe extern "C" {
pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
stream: *mut ffi::c_void,
gpu_index: u32,
lwe_array_out: *mut ffi::c_void,
lwe_output_indexes: *const ffi::c_void,
lut_vector: *const ffi::c_void,
lut_vector_indexes: *const ffi::c_void,
lwe_array_in: *const ffi::c_void,
lwe_input_indexes: *const ffi::c_void,
bootstrapping_key: *const ffi::c_void,
buffer: *mut i8,
lwe_dimension: u32,
glwe_dimension: u32,
polynomial_size: u32,
grouping_factor: u32,
base_log: u32,
level_count: u32,
num_samples: u32,
num_many_lut: u32,
lut_stride: u32,
);
}
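// Note on the suffix convention (an inference from the `<InputTorus, Torus>`
// C++ templates, not stated in the bindings themselves): the first number is
// the keyswitch/input torus width and the second the PBS/output torus width,
// so `_32_64` consumes 32-bit input LWEs and produces 64-bit outputs, while
// `_64_64` is the legacy all-64-bit path. A minimal dispatch sketch over the
// raw bindings (the wrapper name is hypothetical):
unsafe fn scratch_pbs_for_input_bits(
    input_bits: u32,
    stream: *mut ffi::c_void,
    gpu_index: u32,
    buffer: *mut *mut i8,
    lwe_dimension: u32,
    glwe_dimension: u32,
    polynomial_size: u32,
    level_count: u32,
    input_lwe_ciphertext_count: u32,
    noise_reduction_type: PBS_MS_REDUCTION_T,
) -> u64 {
    match input_bits {
        // 32-bit inputs: use the new keyswitch-32 scratch entry point.
        32 => scratch_cuda_programmable_bootstrap_32_64(
            stream, gpu_index, buffer, lwe_dimension, glwe_dimension,
            polynomial_size, level_count, input_lwe_ciphertext_count,
            true, noise_reduction_type,
        ),
        // 64-bit inputs: keep the existing 64-bit scratch entry point.
        64 => scratch_cuda_programmable_bootstrap_64(
            stream, gpu_index, buffer, lwe_dimension, glwe_dimension,
            polynomial_size, level_count, input_lwe_ciphertext_count,
            true, noise_reduction_type,
        ),
        bits => panic!("Unsupported PBS input bitwidth {bits}"),
    }
}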
unsafe extern "C" {
pub fn cleanup_cuda_multi_bit_programmable_bootstrap(
stream: *mut ffi::c_void,

View File

@@ -415,7 +415,7 @@ mod cuda {
let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key();
let ciphertext_modulus_out = CiphertextModulus::<KeyswitchScalar>::new_native();
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
let cpu_keys: CpuKeys<Scalar, KeyswitchScalar> = CpuKeysBuilder::new()
.keyswitch_key(ksk_big_to_small)
.build();
@@ -667,7 +667,8 @@ mod cuda {
&mut encryption_generator,
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().packing_keyswitch_key(pksk).build();
let cpu_keys: CpuKeys<Scalar, Scalar> =
CpuKeysBuilder::new().packing_keyswitch_key(pksk).build();
let bench_id;
match get_bench_type() {

View File

@@ -511,12 +511,19 @@ mod cuda {
};
use tfhe::core_crypto::prelude::*;
fn cuda_ks_pbs<Scalar: UnsignedTorus + CastInto<usize> + CastFrom<u64> + Serialize>(
fn cuda_ks_pbs<
Scalar: UnsignedTorus + CastInto<usize> + CastFrom<u64> + Serialize,
KeyswitchScalar: UnsignedTorus + CastFrom<Scalar> + CastInto<usize>,
>(
c: &mut Criterion,
parameters: &[(String, CryptoParametersRecord<Scalar>)],
) {
let bench_name = "core_crypto::cuda::ks_pbs";
let mut bench_group = c.benchmark_group(bench_name);
let bench_name = format!(
"core_crypto::cuda::ks_pbs::ks{}::pbs{}",
KeyswitchScalar::BITS,
Scalar::BITS
);
let mut bench_group = c.benchmark_group(bench_name.clone());
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
@@ -535,29 +542,57 @@ mod cuda {
continue;
}
// Create the LweSecretKey
let input_lwe_secret_key = allocate_and_generate_new_binary_lwe_secret_key(
params.lwe_dimension.unwrap(),
&mut secret_generator,
);
let output_glwe_secret_key: GlweSecretKeyOwned<Scalar> =
let lwe_noise_distribution_ksk = match params.lwe_noise_distribution.unwrap() {
DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => {
DynamicDistribution::<KeyswitchScalar>::new_gaussian(
gaussian_lwe_noise_distribution.standard_dev(),
)
}
DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => {
DynamicDistribution::<KeyswitchScalar>::new_t_uniform(
match KeyswitchScalar::BITS {
32 => uniform_lwe_noise_distribution.bound_log2() - 32,
64 => uniform_lwe_noise_distribution.bound_log2(),
_ => panic!("Unsupported Keyswitch scalar input dtype"),
},
)
}
};
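// Why `bound_log2() - 32` above: the TUniform bound is absolute, while the
// meaningful quantity is the noise relative to the torus, 2^(bound_log2 - BITS).
// Shrinking the torus from 64 to 32 bits therefore shifts the log2 bound down
// by 32. Worked example with a hypothetical 64-bit bound_log2 of 46:
//     2^46 / 2^64 = 2^-18 = 2^14 / 2^32,  and 14 = 46 - 32.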
let lwe_sk: LweSecretKeyOwned<KeyswitchScalar> =
allocate_and_generate_new_binary_lwe_secret_key(
params.lwe_dimension.unwrap(),
&mut secret_generator,
);
let glwe_sk: GlweSecretKeyOwned<KeyswitchScalar> =
allocate_and_generate_new_binary_glwe_secret_key(
params.glwe_dimension.unwrap(),
params.polynomial_size.unwrap(),
&mut secret_generator,
);
let output_lwe_secret_key = output_glwe_secret_key.into_lwe_secret_key();
let big_lwe_sk = glwe_sk.into_lwe_secret_key();
let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
&output_lwe_secret_key,
&input_lwe_secret_key,
&big_lwe_sk,
&lwe_sk,
params.ks_base_log.unwrap(),
params.ks_level.unwrap(),
params.lwe_noise_distribution.unwrap(),
lwe_noise_distribution_ksk,
CiphertextModulus::new_native(),
&mut encryption_generator,
);
let glwe_sk_64: GlweSecretKeyOwned<Scalar> =
allocate_and_generate_new_binary_glwe_secret_key(
params.glwe_dimension.unwrap(),
params.polynomial_size.unwrap(),
&mut secret_generator,
);
let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key();
let ciphertext_modulus_out = CiphertextModulus::<KeyswitchScalar>::new_native();
let bsk = LweBootstrapKey::new(
Scalar::ZERO,
params.glwe_dimension.unwrap().to_glwe_size(),
@@ -568,7 +603,7 @@ mod cuda {
params.ciphertext_modulus.unwrap(),
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
let cpu_keys: CpuKeys<_, _> = CpuKeysBuilder::new()
.keyswitch_key(ksk_big_to_small)
.bootstrap_key(bsk)
.build();
@@ -582,7 +617,7 @@ mod cuda {
// Allocate a new LweCiphertext and encrypt our plaintext
let input_ks_ct = allocate_and_encrypt_new_lwe_ciphertext(
&output_lwe_secret_key,
&big_lwe_sk_64,
Plaintext(Scalar::ZERO),
params.lwe_noise_distribution.unwrap(),
params.ciphertext_modulus.unwrap(),
@@ -591,10 +626,10 @@ mod cuda {
let input_ks_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&input_ks_ct, &streams);
let output_ks_ct: LweCiphertextOwned<Scalar> = LweCiphertext::new(
Scalar::ZERO,
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
params.ciphertext_modulus.unwrap(),
let output_ks_ct: LweCiphertextOwned<KeyswitchScalar> = LweCiphertext::new(
KeyswitchScalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
ciphertext_modulus_out,
);
let mut output_ks_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&output_ks_ct, &streams);
@@ -611,7 +646,7 @@ mod cuda {
// Allocate the LweCiphertext to store the result of the PBS
let output_pbs_ct = LweCiphertext::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
params.ciphertext_modulus.unwrap(),
);
let mut output_pbs_ct_gpu =
@@ -671,12 +706,12 @@ mod cuda {
.map(|i| {
let mut input_ks_list = LweCiphertextList::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
params.ciphertext_modulus.unwrap(),
);
encrypt_lwe_ciphertext_list(
&output_lwe_secret_key,
&big_lwe_sk_64,
&mut input_ks_list,
&plaintext_list,
params.lwe_noise_distribution.unwrap(),
@@ -692,10 +727,10 @@ mod cuda {
let output_ks_cts = (0..gpu_count)
.map(|i| {
let output_ks_list = LweCiphertextList::new(
Scalar::ZERO,
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
KeyswitchScalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
params.ciphertext_modulus.unwrap(),
ciphertext_modulus_out,
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&output_ks_list,
@@ -724,7 +759,7 @@ mod cuda {
.map(|i| {
let output_pbs_ct = LweCiphertextList::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
params.ciphertext_modulus.unwrap(),
);
@@ -822,12 +857,18 @@ mod cuda {
fn cuda_multi_bit_ks_pbs<
Scalar: UnsignedTorus + CastInto<usize> + CastFrom<u64> + Default + Serialize + Sync,
KeyswitchScalar: UnsignedTorus + CastFrom<Scalar> + CastInto<usize>,
>(
c: &mut Criterion,
parameters: &[(String, CryptoParametersRecord<Scalar>, LweBskGroupingFactor)],
) {
let bench_name = "core_crypto::cuda::multi_bit_ks_pbs";
let mut bench_group = c.benchmark_group(bench_name);
let bench_name = format!(
"core_crypto::cuda::multi_bit_ks_pbs::ks_pbs::ks{}::pbs{}",
KeyswitchScalar::BITS,
Scalar::BITS
);
let mut bench_group = c.benchmark_group(bench_name.clone());
bench_group
.sample_size(10)
.measurement_time(std::time::Duration::from_secs(30));
@@ -846,29 +887,57 @@ mod cuda {
continue;
}
// Create the LweSecretKey
let input_lwe_secret_key = allocate_and_generate_new_binary_lwe_secret_key(
params.lwe_dimension.unwrap(),
&mut secret_generator,
);
let output_glwe_secret_key: GlweSecretKeyOwned<Scalar> =
let lwe_noise_distribution_ksk = match params.lwe_noise_distribution.unwrap() {
DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => {
DynamicDistribution::<KeyswitchScalar>::new_gaussian(
gaussian_lwe_noise_distribution.standard_dev(),
)
}
DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => {
DynamicDistribution::<KeyswitchScalar>::new_t_uniform(
match KeyswitchScalar::BITS {
32 => uniform_lwe_noise_distribution.bound_log2() - 32,
64 => uniform_lwe_noise_distribution.bound_log2(),
_ => panic!("Unsupported Keyswitch scalar input dtype"),
},
)
}
};
let lwe_sk: LweSecretKeyOwned<KeyswitchScalar> =
allocate_and_generate_new_binary_lwe_secret_key(
params.lwe_dimension.unwrap(),
&mut secret_generator,
);
let glwe_sk: GlweSecretKeyOwned<KeyswitchScalar> =
allocate_and_generate_new_binary_glwe_secret_key(
params.glwe_dimension.unwrap(),
params.polynomial_size.unwrap(),
&mut secret_generator,
);
let output_lwe_secret_key = output_glwe_secret_key.into_lwe_secret_key();
let big_lwe_sk = glwe_sk.into_lwe_secret_key();
let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
&output_lwe_secret_key,
&input_lwe_secret_key,
&big_lwe_sk,
&lwe_sk,
params.ks_base_log.unwrap(),
params.ks_level.unwrap(),
params.lwe_noise_distribution.unwrap(),
lwe_noise_distribution_ksk,
CiphertextModulus::new_native(),
&mut encryption_generator,
);
let glwe_sk_64: GlweSecretKeyOwned<Scalar> =
allocate_and_generate_new_binary_glwe_secret_key(
params.glwe_dimension.unwrap(),
params.polynomial_size.unwrap(),
&mut secret_generator,
);
let big_lwe_sk_64 = glwe_sk_64.into_lwe_secret_key();
let ciphertext_modulus_out = CiphertextModulus::<KeyswitchScalar>::new_native();
let multi_bit_bsk = LweMultiBitBootstrapKey::new(
Scalar::ZERO,
params.glwe_dimension.unwrap().to_glwe_size(),
@@ -880,7 +949,7 @@ mod cuda {
params.ciphertext_modulus.unwrap(),
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
let cpu_keys: CpuKeys<_, _> = CpuKeysBuilder::new()
.keyswitch_key(ksk_big_to_small)
.multi_bit_bootstrap_key(multi_bit_bsk)
.build();
@@ -894,7 +963,7 @@ mod cuda {
// Allocate a new LweCiphertext and encrypt our plaintext
let input_ks_ct = allocate_and_encrypt_new_lwe_ciphertext(
&output_lwe_secret_key,
&big_lwe_sk_64,
Plaintext(Scalar::ZERO),
params.lwe_noise_distribution.unwrap(),
params.ciphertext_modulus.unwrap(),
@@ -903,10 +972,10 @@ mod cuda {
let input_ks_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&input_ks_ct, &streams);
let output_ks_ct: LweCiphertextOwned<Scalar> = LweCiphertext::new(
Scalar::ZERO,
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
params.ciphertext_modulus.unwrap(),
let output_ks_ct: LweCiphertextOwned<KeyswitchScalar> = LweCiphertext::new(
KeyswitchScalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
ciphertext_modulus_out,
);
let mut output_ks_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&output_ks_ct, &streams);
@@ -923,7 +992,7 @@ mod cuda {
// Allocate the LweCiphertext to store the result of the PBS
let output_pbs_ct = LweCiphertext::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
params.ciphertext_modulus.unwrap(),
);
let mut output_pbs_ct_gpu =
@@ -981,12 +1050,12 @@ mod cuda {
.map(|i| {
let mut input_ks_list = LweCiphertextList::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
params.ciphertext_modulus.unwrap(),
);
encrypt_lwe_ciphertext_list(
&output_lwe_secret_key,
&big_lwe_sk_64,
&mut input_ks_list,
&plaintext_list,
params.lwe_noise_distribution.unwrap(),
@@ -1002,10 +1071,10 @@ mod cuda {
let output_ks_cts = (0..gpu_count)
.map(|i| {
let output_ks_list = LweCiphertextList::new(
Scalar::ZERO,
input_lwe_secret_key.lwe_dimension().to_lwe_size(),
KeyswitchScalar::ZERO,
lwe_sk.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
params.ciphertext_modulus.unwrap(),
ciphertext_modulus_out,
);
CudaLweCiphertextList::from_lwe_ciphertext_list(
&output_ks_list,
@@ -1034,7 +1103,7 @@ mod cuda {
.map(|i| {
let output_pbs_ct = LweCiphertextList::new(
Scalar::ZERO,
output_lwe_secret_key.lwe_dimension().to_lwe_size(),
big_lwe_sk_64.lwe_dimension().to_lwe_size(),
LweCiphertextCount(elements_per_stream),
params.ciphertext_modulus.unwrap(),
);
@@ -1132,12 +1201,17 @@ mod cuda {
pub fn cuda_ks_pbs_group() {
let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args();
cuda_ks_pbs(&mut criterion, &benchmark_parameters());
cuda_ks_pbs::<u64, u32>(&mut criterion, &benchmark_parameters());
cuda_ks_pbs::<u64, u64>(&mut criterion, &benchmark_parameters());
}
pub fn cuda_multi_bit_ks_pbs_group() {
let mut criterion: Criterion<_> = (Criterion::default()).configure_from_args();
cuda_multi_bit_ks_pbs(
cuda_multi_bit_ks_pbs::<u64, u32>(
&mut criterion,
&multi_bit_benchmark_parameters_with_grouping(),
);
cuda_multi_bit_ks_pbs::<u64, u64>(
&mut criterion,
&multi_bit_benchmark_parameters_with_grouping(),
);

View File

@@ -250,7 +250,7 @@ mod cuda {
}
};
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().bootstrap_key(bsk).build();
let cpu_keys: CpuKeys<_, u64> = CpuKeysBuilder::new().bootstrap_key(bsk).build();
let message_modulus: u64 = 1 << 4;
let input_message: u64 = 3;
@@ -493,7 +493,7 @@ mod cuda {
squash_params.ciphertext_modulus,
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
let cpu_keys: CpuKeys<_, u64> = CpuKeysBuilder::new()
.multi_bit_bootstrap_key(multi_bit_bsk)
.build();

View File

@@ -978,7 +978,8 @@ mod cuda {
params.ciphertext_modulus.unwrap(),
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new().bootstrap_key(bsk).build();
let cpu_keys: CpuKeys<Scalar, Scalar> =
CpuKeysBuilder::new().bootstrap_key(bsk).build();
let bench_id;
@@ -1236,7 +1237,7 @@ mod cuda {
params.ciphertext_modulus.unwrap(),
);
let cpu_keys: CpuKeys<_> = CpuKeysBuilder::new()
let cpu_keys: CpuKeys<Scalar, Scalar> = CpuKeysBuilder::new()
.multi_bit_bootstrap_key(multi_bit_bsk)
.build();

View File

@@ -575,28 +575,28 @@ mod cuda_utils {
}
/// Computing keys in their CPU flavor.
pub struct CpuKeys<T: UnsignedInteger> {
ksk: Option<LweKeyswitchKeyOwned<T>>,
pub struct CpuKeys<T: UnsignedInteger, KST: UnsignedInteger> {
ksk: Option<LweKeyswitchKeyOwned<KST>>,
pksk: Option<LwePackingKeyswitchKeyOwned<T>>,
bsk: Option<LweBootstrapKeyOwned<T>>,
multi_bit_bsk: Option<LweMultiBitBootstrapKeyOwned<T>>,
}
impl<T: UnsignedInteger> CpuKeys<T> {
pub fn builder() -> CpuKeysBuilder<T> {
impl<T: UnsignedInteger, KST: UnsignedInteger> CpuKeys<T, KST> {
pub fn builder() -> CpuKeysBuilder<T, KST> {
CpuKeysBuilder::new()
}
}
pub struct CpuKeysBuilder<T: UnsignedInteger> {
ksk: Option<LweKeyswitchKeyOwned<T>>,
pub struct CpuKeysBuilder<T: UnsignedInteger, KST: UnsignedInteger> {
ksk: Option<LweKeyswitchKeyOwned<KST>>,
pksk: Option<LwePackingKeyswitchKeyOwned<T>>,
bsk: Option<LweBootstrapKeyOwned<T>>,
multi_bit_bsk: Option<LweMultiBitBootstrapKeyOwned<T>>,
}
impl<T: UnsignedInteger> CpuKeysBuilder<T> {
pub fn new() -> CpuKeysBuilder<T> {
impl<T: UnsignedInteger, KST: UnsignedInteger> CpuKeysBuilder<T, KST> {
pub fn new() -> CpuKeysBuilder<T, KST> {
Self {
ksk: None,
pksk: None,
@@ -605,7 +605,7 @@ mod cuda_utils {
}
}
pub fn keyswitch_key(mut self, ksk: LweKeyswitchKeyOwned<T>) -> CpuKeysBuilder<T> {
pub fn keyswitch_key(mut self, ksk: LweKeyswitchKeyOwned<KST>) -> CpuKeysBuilder<T, KST> {
self.ksk = Some(ksk);
self
}
@@ -613,12 +613,12 @@ mod cuda_utils {
pub fn packing_keyswitch_key(
mut self,
pksk: LwePackingKeyswitchKeyOwned<T>,
) -> CpuKeysBuilder<T> {
) -> CpuKeysBuilder<T, KST> {
self.pksk = Some(pksk);
self
}
pub fn bootstrap_key(mut self, bsk: LweBootstrapKeyOwned<T>) -> CpuKeysBuilder<T> {
pub fn bootstrap_key(mut self, bsk: LweBootstrapKeyOwned<T>) -> CpuKeysBuilder<T, KST> {
self.bsk = Some(bsk);
self
}
@@ -626,12 +626,12 @@ mod cuda_utils {
pub fn multi_bit_bootstrap_key(
mut self,
mb_bsk: LweMultiBitBootstrapKeyOwned<T>,
) -> CpuKeysBuilder<T> {
) -> CpuKeysBuilder<T, KST> {
self.multi_bit_bsk = Some(mb_bsk);
self
}
pub fn build(self) -> CpuKeys<T> {
pub fn build(self) -> CpuKeys<T, KST> {
CpuKeys {
ksk: self.ksk,
pksk: self.pksk,
@@ -640,7 +640,7 @@ mod cuda_utils {
}
}
}
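// Usage sketch for the widened builder (a 64-bit PBS paired with a 32-bit
// keyswitch key; `ksk_big_to_small` is an LweKeyswitchKeyOwned<u32> and `bsk`
// an LweBootstrapKeyOwned<u64>, as in the benchmarks above):
// let cpu_keys: CpuKeys<u64, u32> = CpuKeysBuilder::new()
//     .keyswitch_key(ksk_big_to_small)
//     .bootstrap_key(bsk)
//     .build();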
impl<T: UnsignedInteger> Default for CpuKeysBuilder<T> {
impl<T: UnsignedInteger, KST: UnsignedInteger> Default for CpuKeysBuilder<T, KST> {
fn default() -> Self {
Self::new()
}
@@ -648,17 +648,17 @@ mod cuda_utils {
/// Computing keys in their Cuda flavor.
#[allow(dead_code)]
pub struct CudaLocalKeys<T: UnsignedInteger> {
pub ksk: Option<CudaLweKeyswitchKey<T>>,
pub struct CudaLocalKeys<T: UnsignedInteger, KST: UnsignedInteger> {
pub ksk: Option<CudaLweKeyswitchKey<KST>>,
pub pksk: Option<CudaLwePackingKeyswitchKey<T>>,
pub bsk: Option<CudaLweBootstrapKey>,
pub multi_bit_bsk: Option<CudaLweMultiBitBootstrapKey<T>>,
}
#[allow(dead_code)]
impl<T: UnsignedInteger> CudaLocalKeys<T> {
impl<T: UnsignedInteger, KST: UnsignedInteger> CudaLocalKeys<T, KST> {
pub fn from_cpu_keys(
cpu_keys: &CpuKeys<T>,
cpu_keys: &CpuKeys<T, KST>,
ms_noise_reduction: Option<CudaModulusSwitchNoiseReductionConfiguration>,
stream: &CudaStreams,
) -> Self {
@@ -681,10 +681,10 @@ mod cuda_utils {
}
/// Instantiate Cuda computing keys on each available GPU.
pub fn cuda_local_keys_core<T: UnsignedInteger>(
cpu_keys: &CpuKeys<T>,
pub fn cuda_local_keys_core<T: UnsignedInteger, KST: UnsignedInteger>(
cpu_keys: &CpuKeys<T, KST>,
ms_noise_reduction: Option<CudaModulusSwitchNoiseReductionConfiguration>,
) -> Vec<CudaLocalKeys<T>> {
) -> Vec<CudaLocalKeys<T, KST>> {
let gpu_count = get_number_of_gpus() as usize;
let mut gpu_keys_vec = Vec::with_capacity(gpu_count);
for i in 0..gpu_count {

View File

@@ -10,8 +10,8 @@ use crate::core_crypto::prelude::{CastInto, UnsignedTorus};
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until streams is synchronized
#[allow(clippy::too_many_arguments)]
pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<InputScalar, Scalar>(
input: &CudaLweCiphertextList<InputScalar>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
lut_indexes: &CudaVec<Scalar>,
@@ -20,6 +20,7 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>
multi_bit_bsk: &CudaLweMultiBitBootstrapKey<Scalar>,
streams: &CudaStreams,
) where
InputScalar: UnsignedTorus + CastInto<usize>,
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus + CastInto<usize>,
{
@@ -59,21 +60,22 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>
multi_bit_bsk.polynomial_size(),
);
assert_eq!(
input.ciphertext_modulus(),
output.ciphertext_modulus(),
assert!(
input.ciphertext_modulus().associated_scalar_bits()
<= output.ciphertext_modulus().associated_scalar_bits(),
"Mismatched CiphertextModulus between input ({:?}) and output ({:?})",
input.ciphertext_modulus(),
output.ciphertext_modulus(),
);
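// The old input/output modulus equality no longer holds by construction: a
// 32-bit keyswitched input can legitimately feed a 64-bit PBS, so the check
// above is relaxed to "input scalar width <= output scalar width".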
assert_eq!(
input.ciphertext_modulus(),
output.ciphertext_modulus(),
accumulator.ciphertext_modulus(),
"Mismatched CiphertextModulus between input ({:?}) and accumulator ({:?})",
"Mismatched CiphertextModulus between output ({:?}) and accumulator ({:?})",
input.ciphertext_modulus(),
output.ciphertext_modulus(),
accumulator.ciphertext_modulus(),
);
assert_eq!(
streams.gpu_indexes[0],
multi_bit_bsk.d_vec.gpu_index(0),
@@ -144,8 +146,8 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>
}
#[allow(clippy::too_many_arguments)]
pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<InputScalar, Scalar>(
input: &CudaLweCiphertextList<InputScalar>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
lut_indexes: &CudaVec<Scalar>,
@@ -154,6 +156,7 @@ pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext<Scalar>(
multi_bit_bsk: &CudaLweMultiBitBootstrapKey<Scalar>,
streams: &CudaStreams,
) where
InputScalar: UnsignedTorus + CastInto<usize>,
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus + CastInto<usize>,
{

View File

@@ -12,8 +12,8 @@ use crate::core_crypto::prelude::{CastInto, UnsignedTorus};
/// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must not
/// be dropped until streams is synchronized
#[allow(clippy::too_many_arguments)]
pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<InputScalar, Scalar>(
input: &CudaLweCiphertextList<InputScalar>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
lut_indexes: &CudaVec<Scalar>,
@@ -22,6 +22,7 @@ pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
bsk: &CudaLweBootstrapKey,
streams: &CudaStreams,
) where
InputScalar: UnsignedTorus + CastInto<usize>,
// CastInto required for PBS modulus switch which returns a usize
Scalar: UnsignedTorus + CastInto<usize>,
{
@@ -259,8 +260,8 @@ pub unsafe fn cuda_programmable_bootstrap_128_lwe_ciphertext_async<Scalar>(
}
#[allow(clippy::too_many_arguments)]
pub fn cuda_programmable_bootstrap_lwe_ciphertext<Scalar>(
input: &CudaLweCiphertextList<Scalar>,
pub fn cuda_programmable_bootstrap_lwe_ciphertext<InputScalar, Scalar>(
input: &CudaLweCiphertextList<InputScalar>,
output: &mut CudaLweCiphertextList<Scalar>,
accumulator: &CudaGlweCiphertextList<Scalar>,
lut_indexes: &CudaVec<Scalar>,
@@ -269,6 +270,7 @@ pub fn cuda_programmable_bootstrap_lwe_ciphertext<Scalar>(
bsk: &CudaLweBootstrapKey,
streams: &CudaStreams,
) where
InputScalar: UnsignedTorus + CastInto<usize>,
Scalar: UnsignedTorus + CastInto<usize>,
{
unsafe {

View File

@@ -4,8 +4,11 @@ use crate::core_crypto::gpu::lwe_bootstrap_key::{
CudaLweBootstrapKey, CudaModulusSwitchNoiseReductionConfiguration,
};
use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList;
use crate::core_crypto::gpu::lwe_keyswitch_key::CudaLweKeyswitchKey;
use crate::core_crypto::gpu::vec::{CudaVec, GpuIndex};
use crate::core_crypto::gpu::{cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams};
use crate::core_crypto::gpu::{
cuda_keyswitch_lwe_ciphertext, cuda_programmable_bootstrap_lwe_ciphertext, CudaStreams,
};
use crate::core_crypto::prelude::misc::check_encrypted_content_respects_mod;
use itertools::Itertools;
@@ -174,6 +177,218 @@ fn lwe_encrypt_pbs_decrypt<
}
}
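// New test below: encrypt under the big 64-bit LWE key, keyswitch down to a
// 32-bit intermediate key (with the TUniform noise bound rescaled by 32 bits),
// run the 32->64 PBS, then decrypt and check the output still equals f(msg).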
fn lwe_ks_encrypt_pbs_decrypt_compare_32_64<
Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize> + CastFrom<u32> + CastInto<u32>,
>(
params: ClassicTestParams<Scalar>,
) {
assert!(Scalar::BITS <= 64);
let input_lwe_dimension = params.lwe_dimension;
let lwe_noise_distribution = params.lwe_noise_distribution;
let glwe_noise_distribution = params.glwe_noise_distribution;
let ciphertext_modulus = params.ciphertext_modulus;
let message_modulus_log = params.message_modulus_log;
let msg_modulus = Scalar::ONE.shl(message_modulus_log.0);
let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus);
let glwe_dimension = params.glwe_dimension;
let polynomial_size = params.polynomial_size;
let decomp_base_log = params.pbs_base_log;
let decomp_level_count = params.pbs_level;
let gpu_index = 0;
let stream = CudaStreams::new_single_gpu(GpuIndex::new(gpu_index));
let mut rsc = TestResources::new();
let f = |x: Scalar| {
x.wrapping_mul(Scalar::TWO)
.wrapping_sub(Scalar::ONE)
.wrapping_rem(msg_modulus)
};
let delta: Scalar = encoding_with_padding / msg_modulus;
let mut msg = msg_modulus;
const NB_TESTS: usize = 10;
let number_of_messages = 1;
let accumulator = generate_programmable_bootstrap_glwe_lut(
polynomial_size,
glwe_dimension.to_glwe_size(),
msg_modulus.cast_into(),
ciphertext_modulus,
delta,
f,
);
assert!(check_encrypted_content_respects_mod(
&accumulator,
ciphertext_modulus
));
let lwe_noise_distribution_ksk_32 = match params.lwe_noise_distribution {
DynamicDistribution::Gaussian(gaussian_lwe_noise_distribution) => {
DynamicDistribution::<u32>::new_gaussian(gaussian_lwe_noise_distribution.standard_dev())
}
DynamicDistribution::TUniform(uniform_lwe_noise_distribution) => {
DynamicDistribution::<u32>::new_t_uniform(
uniform_lwe_noise_distribution.bound_log2() - 32,
)
}
};
// Create the LweSecretKey
let intermediate_lwe_secret_key_32: LweSecretKeyOwned<u32> =
allocate_and_generate_new_binary_lwe_secret_key(
input_lwe_dimension,
&mut rsc.secret_random_generator,
); // lwe_sk
let glwe_sk_64: GlweSecretKeyOwned<Scalar> = allocate_and_generate_new_binary_glwe_secret_key(
params.glwe_dimension,
params.polynomial_size,
&mut rsc.secret_random_generator,
);
let big_lwe_sk_64 = glwe_sk_64.clone().into_lwe_secret_key();
let ksk_big_to_small = allocate_and_generate_new_lwe_keyswitch_key(
&big_lwe_sk_64,
&intermediate_lwe_secret_key_32,
params.ks_base_log,
params.ks_level,
lwe_noise_distribution_ksk_32,
CiphertextModulus::<u32>::new_native(),
&mut rsc.encryption_random_generator,
);
let d_ksk_big_to_small =
CudaLweKeyswitchKey::from_lwe_keyswitch_key(&ksk_big_to_small, &stream);
let ciphertext_modulus_ks32 = CiphertextModulus::<u32>::new_native();
let output_lwe_dimension = big_lwe_sk_64.lwe_dimension();
let mut bsk = LweBootstrapKey::new(
Scalar::ZERO,
glwe_dimension.to_glwe_size(),
polynomial_size,
decomp_base_log,
decomp_level_count,
input_lwe_dimension,
ciphertext_modulus,
);
par_generate_lwe_bootstrap_key(
&intermediate_lwe_secret_key_32,
&glwe_sk_64,
&mut bsk,
glwe_noise_distribution,
&mut rsc.encryption_random_generator,
);
assert!(check_encrypted_content_respects_mod(
&*bsk,
ciphertext_modulus
));
let d_bsk = CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, None, &stream);
while msg != Scalar::ZERO {
msg = msg.wrapping_sub(Scalar::ONE);
for _ in 0..NB_TESTS {
let plaintext = Plaintext(msg * delta);
let lwe_ciphertext_in = allocate_and_encrypt_new_lwe_ciphertext(
&big_lwe_sk_64,
plaintext,
lwe_noise_distribution,
ciphertext_modulus,
&mut rsc.encryption_random_generator,
);
assert!(check_encrypted_content_respects_mod(
&lwe_ciphertext_in,
ciphertext_modulus
));
let d_lwe_ciphertext_in =
CudaLweCiphertextList::from_lwe_ciphertext(&lwe_ciphertext_in, &stream);
let output_ks_ct: LweCiphertextOwned<u32> = LweCiphertext::new(
0u32,
intermediate_lwe_secret_key_32.lwe_dimension().to_lwe_size(),
ciphertext_modulus_ks32,
);
let mut output_ks_ct_gpu =
CudaLweCiphertextList::from_lwe_ciphertext(&output_ks_ct, &stream);
let mut d_out_pbs_ct = CudaLweCiphertextList::new(
output_lwe_dimension,
LweCiphertextCount(1),
ciphertext_modulus,
&stream,
);
let d_accumulator = CudaGlweCiphertextList::from_glwe_ciphertext(&accumulator, &stream);
let test_vector_indexes: Vec<Scalar> = vec![Scalar::ZERO; number_of_messages];
let mut d_test_vector_indexes =
unsafe { CudaVec::<Scalar>::new_async(number_of_messages, &stream, 0) };
unsafe { d_test_vector_indexes.copy_from_cpu_async(&test_vector_indexes, &stream, 0) };
let num_blocks = d_lwe_ciphertext_in.0.lwe_ciphertext_count.0;
let lwe_indexes_usize: Vec<usize> = (0..num_blocks).collect_vec();
let lwe_indexes = lwe_indexes_usize
.iter()
.map(|&x| <usize as CastInto<Scalar>>::cast_into(x))
.collect_vec();
let mut d_output_indexes =
unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
let mut d_input_indexes =
unsafe { CudaVec::<Scalar>::new_async(num_blocks, &stream, 0) };
unsafe {
d_input_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
d_output_indexes.copy_from_cpu_async(&lwe_indexes, &stream, 0);
}
cuda_keyswitch_lwe_ciphertext(
&d_ksk_big_to_small,
&d_lwe_ciphertext_in,
&mut output_ks_ct_gpu,
&d_input_indexes,
&d_output_indexes,
true,
&stream,
false,
);
cuda_programmable_bootstrap_lwe_ciphertext(
&output_ks_ct_gpu,
&mut d_out_pbs_ct,
&d_accumulator,
&d_test_vector_indexes,
&d_output_indexes,
&d_input_indexes,
&d_bsk,
&stream,
);
let out_pbs_ct = d_out_pbs_ct.into_lwe_ciphertext(&stream);
assert!(check_encrypted_content_respects_mod(
&out_pbs_ct,
ciphertext_modulus
));
let decrypted = decrypt_lwe_ciphertext(&big_lwe_sk_64, &out_pbs_ct);
let decoded = round_decode(decrypted.0, delta) % msg_modulus;
assert_eq!(decoded, f(msg));
}
}
}
fn lwe_encrypt_centered_ms_pbs_decrypt<
Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize>,
>(
@@ -327,6 +542,6 @@ fn lwe_encrypt_centered_ms_pbs_decrypt<
}
}
}
create_gpu_parameterized_test!(lwe_ks_encrypt_pbs_decrypt_compare_32_64);
create_gpu_parameterized_test!(lwe_encrypt_pbs_decrypt);
create_gpu_parameterized_test!(lwe_encrypt_centered_ms_pbs_decrypt);

View File

@@ -166,13 +166,13 @@ impl Drop for CudaStreams {
/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is
/// required
#[allow(clippy::too_many_arguments)]
pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
pub unsafe fn programmable_bootstrap_async<KST: UnsignedInteger, T: UnsignedInteger>(
streams: &CudaStreams,
lwe_array_out: &mut CudaVec<T>,
lwe_out_indexes: &CudaVec<T>,
test_vector: &CudaVec<T>,
test_vector_indexes: &CudaVec<T>,
lwe_array_in: &CudaVec<T>,
lwe_array_in: &CudaVec<KST>,
lwe_in_indexes: &CudaVec<T>,
bootstrapping_key: &CudaVec<f64>,
lwe_dimension: LweDimension,
@@ -193,45 +193,89 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
PBSMSNoiseReductionType::Centered
});
scratch_cuda_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
noise_reduction_type as u32,
);
if KST::BITS == 32 {
scratch_cuda_programmable_bootstrap_32_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
noise_reduction_type as u32,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_32_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
cleanup_cuda_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else if KST::BITS == 64 {
scratch_cuda_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
noise_reduction_type as u32,
);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
lwe_out_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
lwe_in_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else {
panic!("Unsupported PBS input bitwidth {}", KST::BITS);
}
}
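// Callers pick the input torus through the new KST type parameter, e.g.
// `programmable_bootstrap_async::<u32, u64>(..)` for 32-bit inputs into a
// 64-bit PBS, or `::<u64, u64>(..)` for the legacy path; any other width
// panics at runtime as above.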
#[allow(clippy::too_many_arguments)]
@@ -348,6 +392,7 @@ pub unsafe fn programmable_bootstrap_128_async<T: UnsignedInteger>(
#[allow(clippy::too_many_arguments)]
pub unsafe fn programmable_bootstrap_multi_bit_async<
T: UnsignedInteger,
KST: UnsignedInteger,
B: Any + UnsignedInteger,
>(
streams: &CudaStreams,
@@ -355,7 +400,7 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<
output_indexes: &CudaVec<T>,
test_vector: &CudaVec<B>,
test_vector_indexes: &CudaVec<T>,
lwe_array_in: &CudaVec<T>,
lwe_array_in: &CudaVec<KST>,
input_indexes: &CudaVec<T>,
bootstrapping_key: &CudaVec<B>,
lwe_dimension: LweDimension,
@@ -406,42 +451,83 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<
std::ptr::addr_of_mut!(pbs_buffer),
);
} else if TypeId::of::<B>() == TypeId::of::<u64>() {
scratch_cuda_multi_bit_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
output_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
input_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_multi_bit_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
if TypeId::of::<KST>() == TypeId::of::<u32>() {
scratch_cuda_multi_bit_programmable_bootstrap_32_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_32_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
output_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
input_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_multi_bit_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else if TypeId::of::<KST>() == TypeId::of::<u64>() {
scratch_cuda_multi_bit_programmable_bootstrap_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
level.0 as u32,
num_samples,
true,
);
cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
streams.ptr[0],
streams.gpu_indexes[0].get(),
lwe_array_out.as_mut_c_ptr(0),
output_indexes.as_c_ptr(0),
test_vector.as_c_ptr(0),
test_vector_indexes.as_c_ptr(0),
lwe_array_in.as_c_ptr(0),
input_indexes.as_c_ptr(0),
bootstrapping_key.as_c_ptr(0),
pbs_buffer,
lwe_dimension.0 as u32,
glwe_dimension.0 as u32,
polynomial_size.0 as u32,
grouping_factor.0 as u32,
base_log.0 as u32,
level.0 as u32,
num_samples,
num_many_lut,
lut_stride,
);
cleanup_cuda_multi_bit_programmable_bootstrap(
streams.ptr[0],
streams.gpu_indexes[0].get(),
std::ptr::addr_of_mut!(pbs_buffer),
);
} else {
panic!("Unsupported MB PBS input torus size");
}
} else {
panic!("Unsupported torus size")
}
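// Net effect: the multi-bit path dispatches twice: first on the
// bootstrapping-key torus B (u128 handled earlier, u64 here), then on the
// input torus KST (u32 selects the new _32_64 entry points, u64 the
// existing _64 ones).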