mirror of
https://github.com/zama-ai/concrete.git
synced 2026-02-08 11:35:02 -05:00
feat(cuda): support N=4096 and 8192 for the low latency bootstrap
This commit is contained in:
@@ -73,8 +73,8 @@ void cuda_extract_bits_32(
|
||||
void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk,
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk,
|
||||
uint32_t number_of_samples);
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
void cuda_extract_bits_64(
|
||||
void *v_stream, uint32_t gpu_index, void *list_lwe_array_out,
|
||||
@@ -84,8 +84,8 @@ void cuda_extract_bits_64(
|
||||
void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk,
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk,
|
||||
uint32_t number_of_samples);
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory);
|
||||
}
|
||||
|
||||
#ifdef __CUDACC__
|
||||
|
||||
@@ -68,9 +68,10 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
|
||||
glwe_dimension == 1));
|
||||
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
|
||||
"1024, 2048",
|
||||
"1024, 2048, 4096, 8192",
|
||||
polynomial_size == 512 || polynomial_size == 1024 ||
|
||||
polynomial_size == 2048));
|
||||
polynomial_size == 2048 || polynomial_size == 4096 ||
|
||||
polynomial_size == 8192));
|
||||
// The number of samples should be lower than SM/(4 * (k + 1) * l) (the
|
||||
// factor 4 being related to the occupancy of 50%). The only supported
|
||||
// value for k is 1, so k + 1 = 2 for now.
|
||||
@@ -88,21 +89,35 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors);
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_bootstrap_low_latency<uint32_t, Degree<1024>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors);
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_bootstrap_low_latency<uint32_t, Degree<2048>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors);
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_bootstrap_low_latency<uint32_t, Degree<4096>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_bootstrap_low_latency<uint32_t, Degree<8192>>(
|
||||
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -121,9 +136,10 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
assert(("Error (GPU low latency PBS): glwe_dimension should be equal to 1",
|
||||
glwe_dimension == 1));
|
||||
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
|
||||
"1024, 2048",
|
||||
"1024, 2048, 4096, 8192",
|
||||
polynomial_size == 512 || polynomial_size == 1024 ||
|
||||
polynomial_size == 2048));
|
||||
polynomial_size == 2048 || polynomial_size == 4096 ||
|
||||
polynomial_size == 8192));
|
||||
// The number of samples should be lower than SM/(4 * (k + 1) * l) (the
|
||||
// factor 4 being related to the occupancy of 50%). The only supported
|
||||
// value for k is 1, so k + 1 = 2 for now.
|
||||
@@ -141,21 +157,35 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors);
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_bootstrap_low_latency<uint64_t, Degree<1024>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors);
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_bootstrap_low_latency<uint64_t, Degree<2048>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors);
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_bootstrap_low_latency<uint64_t, Degree<4096>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_bootstrap_low_latency<uint64_t, Degree<8192>>(
|
||||
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
|
||||
(double2 *)bootstrapping_key, lwe_dimension, polynomial_size, base_log,
|
||||
level_count, num_samples, num_lut_vectors, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
||||
@@ -117,7 +117,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,
|
||||
correction_inverse_fft_inplace<params>(fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform the inverse FFT on the result of the GGSW x GWE and add to the
|
||||
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
|
||||
// accumulator
|
||||
NSMFFT_inverse<HalfDegree<params>>(fft);
|
||||
synchronize_threads_in_block();
|
||||
@@ -127,7 +127,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *mask_join_buffer,
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <typename Torus, class params>
|
||||
template <typename Torus, class params, sharedMemDegree SMD>
|
||||
/*
|
||||
* Kernel launched by the low latency version of the
|
||||
* bootstrapping, that uses cooperative groups
|
||||
@@ -142,7 +142,8 @@ __global__ void device_bootstrap_low_latency(
|
||||
Torus *lwe_array_out, Torus *lut_vector, Torus *lwe_array_in,
|
||||
double2 *bootstrapping_key, double2 *mask_join_buffer,
|
||||
double2 *body_join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size,
|
||||
uint32_t base_log, uint32_t level_count) {
|
||||
uint32_t base_log, uint32_t level_count, char *device_mem,
|
||||
int device_memory_size_per_block) {
|
||||
|
||||
grid_group grid = this_grid();
|
||||
|
||||
@@ -150,8 +151,14 @@ __global__ void device_bootstrap_low_latency(
|
||||
// bootstrap, since shared memory is kept in L1 cache and accessing it is
|
||||
// much faster than global memory
|
||||
extern __shared__ char sharedmem[];
|
||||
char *selected_memory;
|
||||
int block_index =
|
||||
blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
|
||||
|
||||
char *selected_memory = sharedmem;
|
||||
if constexpr (SMD == FULLSM)
|
||||
selected_memory = sharedmem;
|
||||
else
|
||||
selected_memory = &device_mem[block_index * device_memory_size_per_block];
|
||||
|
||||
Torus *accumulator = (Torus *)selected_memory;
|
||||
Torus *accumulator_rotated =
|
||||
@@ -159,6 +166,8 @@ __global__ void device_bootstrap_low_latency(
|
||||
double2 *accumulator_fft =
|
||||
(double2 *)accumulator_rotated +
|
||||
polynomial_size / (sizeof(double2) / sizeof(Torus));
|
||||
if constexpr (SMD == PARTIALSM)
|
||||
accumulator_fft = (double2 *)sharedmem;
|
||||
|
||||
// The third dimension of the block is used to determine on which ciphertext
|
||||
// this block is operating, in the case of batch bootstraps
|
||||
@@ -251,25 +260,37 @@ __host__ void host_bootstrap_low_latency(
|
||||
uint32_t *lut_vector_indexes, Torus *lwe_array_in,
|
||||
double2 *bootstrapping_key, uint32_t lwe_dimension,
|
||||
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
|
||||
uint32_t num_samples, uint32_t num_lut_vectors) {
|
||||
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
|
||||
int buffer_size_per_gpu =
|
||||
level_count * num_samples * polynomial_size / 2 * sizeof(double2);
|
||||
int buffer_size_per_gpu = level_count * input_lwe_ciphertext_count *
|
||||
polynomial_size / 2 * sizeof(double2);
|
||||
double2 *mask_buffer_fft =
|
||||
(double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index);
|
||||
double2 *body_buffer_fft =
|
||||
(double2 *)cuda_malloc_async(buffer_size_per_gpu, *stream, gpu_index);
|
||||
|
||||
int bytes_needed = sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(Torus) * polynomial_size + // accumulator
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
// With SM each block corresponds to either the mask or body, no need to
|
||||
// duplicate data for each
|
||||
int SM_FULL = sizeof(Torus) * polynomial_size + // accumulator_rotated
|
||||
sizeof(Torus) * polynomial_size + // accumulator
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft
|
||||
|
||||
int SM_PART =
|
||||
sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
|
||||
|
||||
int DM_FULL = SM_FULL;
|
||||
|
||||
int DM_PART = DM_FULL - SM_PART;
|
||||
|
||||
char *d_mem;
|
||||
|
||||
int thds = polynomial_size / params::opt;
|
||||
dim3 grid(level_count, 2, num_samples);
|
||||
dim3 grid(level_count, 2, input_lwe_ciphertext_count);
|
||||
|
||||
void *kernel_args[10];
|
||||
void *kernel_args[12];
|
||||
kernel_args[0] = &lwe_array_out;
|
||||
kernel_args[1] = &lut_vector;
|
||||
kernel_args[2] = &lwe_array_in;
|
||||
@@ -280,22 +301,55 @@ __host__ void host_bootstrap_low_latency(
|
||||
kernel_args[7] = &polynomial_size;
|
||||
kernel_args[8] = &base_log;
|
||||
kernel_args[9] = &level_count;
|
||||
kernel_args[10] = &d_mem;
|
||||
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency<Torus, params>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, bytes_needed));
|
||||
cudaFuncSetCacheConfig(device_bootstrap_low_latency<Torus, params>,
|
||||
cudaFuncCachePreferShared);
|
||||
if (max_shared_memory < SM_PART) {
|
||||
kernel_args[11] = &DM_FULL;
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
d_mem = (char *)cuda_malloc_async(DM_FULL * input_lwe_ciphertext_count *
|
||||
level_count * 2,
|
||||
*stream, gpu_index);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
checkCudaErrors(cudaLaunchCooperativeKernel(
|
||||
(void *)device_bootstrap_low_latency<Torus, params, NOSM>, grid, thds,
|
||||
(void **)kernel_args, 0, *stream));
|
||||
} else if (max_shared_memory < SM_FULL) {
|
||||
kernel_args[11] = &DM_PART;
|
||||
d_mem = (char *)cuda_malloc_async(DM_PART * input_lwe_ciphertext_count *
|
||||
level_count * 2,
|
||||
*stream, gpu_index);
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency<Torus, params, PARTIALSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART));
|
||||
cudaFuncSetCacheConfig(
|
||||
device_bootstrap_low_latency<Torus, params, PARTIALSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
checkCudaErrors(cudaLaunchCooperativeKernel(
|
||||
(void *)device_bootstrap_low_latency<Torus, params, PARTIALSM>, grid,
|
||||
thds, (void **)kernel_args, SM_PART, *stream));
|
||||
|
||||
checkCudaErrors(cudaLaunchCooperativeKernel(
|
||||
(void *)device_bootstrap_low_latency<Torus, params>, grid, thds,
|
||||
(void **)kernel_args, bytes_needed, *stream));
|
||||
} else {
|
||||
int DM_NONE = 0;
|
||||
kernel_args[11] = &DM_NONE;
|
||||
d_mem = (char *)cuda_malloc_async(0, *stream, gpu_index);
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
device_bootstrap_low_latency<Torus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
|
||||
cudaFuncSetCacheConfig(device_bootstrap_low_latency<Torus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared);
|
||||
checkCudaErrors(cudaLaunchCooperativeKernel(
|
||||
(void *)device_bootstrap_low_latency<Torus, params, FULLSM>, grid, thds,
|
||||
(void **)kernel_args, SM_FULL, *stream));
|
||||
}
|
||||
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
// Synchronize the streams before copying the result to lwe_array_out at the
|
||||
// right place
|
||||
cudaStreamSynchronize(*stream);
|
||||
cuda_drop_async(mask_buffer_fft, *stream, gpu_index);
|
||||
cuda_drop_async(body_buffer_fft, *stream, gpu_index);
|
||||
cuda_drop_async(d_mem, *stream, gpu_index);
|
||||
}
|
||||
|
||||
#endif // LOWLAT_PBS_H
|
||||
|
||||
@@ -118,16 +118,17 @@ void cuda_extract_bits_32(
|
||||
void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk,
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk,
|
||||
uint32_t number_of_samples) {
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU extract bits): base log should be <= 32",
|
||||
base_log_bsk <= 32));
|
||||
assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
|
||||
glwe_dimension == 1));
|
||||
assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
|
||||
"512, 1024, 2048",
|
||||
"512, 1024, 2048, 4096, 8192",
|
||||
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
|
||||
lwe_dimension_in == 2048));
|
||||
lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
|
||||
lwe_dimension_in == 8192));
|
||||
// The number of samples should be lower than the number of streaming
|
||||
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
|
||||
// to the occupancy of 50%). The only supported value for k is 1, so
|
||||
@@ -151,7 +152,7 @@ void cuda_extract_bits_32(
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples);
|
||||
number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_extract_bits<uint32_t, Degree<1024>>(
|
||||
@@ -163,7 +164,7 @@ void cuda_extract_bits_32(
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples);
|
||||
number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_extract_bits<uint32_t, Degree<2048>>(
|
||||
@@ -175,7 +176,31 @@ void cuda_extract_bits_32(
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples);
|
||||
number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_extract_bits<uint32_t, Degree<4096>>(
|
||||
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer,
|
||||
(uint32_t *)lwe_array_in_shifted_buffer,
|
||||
(uint32_t *)lwe_array_out_ks_buffer,
|
||||
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_extract_bits<uint32_t, Degree<8192>>(
|
||||
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
|
||||
(uint32_t *)lwe_array_in, (uint32_t *)lwe_array_in_buffer,
|
||||
(uint32_t *)lwe_array_in_shifted_buffer,
|
||||
(uint32_t *)lwe_array_out_ks_buffer,
|
||||
(uint32_t *)lwe_array_out_pbs_buffer, (uint32_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint32_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
@@ -190,16 +215,17 @@ void cuda_extract_bits_64(
|
||||
void *ksk, void *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log,
|
||||
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
|
||||
uint32_t glwe_dimension, uint32_t base_log_bsk, uint32_t level_count_bsk,
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk,
|
||||
uint32_t number_of_samples) {
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
assert(("Error (GPU extract bits): base log should be <= 64",
|
||||
base_log_bsk <= 64));
|
||||
assert(("Error (GPU extract bits): glwe_dimension should be equal to 1",
|
||||
glwe_dimension == 1));
|
||||
assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
|
||||
"512, 1024, 2048",
|
||||
"512, 1024, 2048, 4096, 8192",
|
||||
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
|
||||
lwe_dimension_in == 2048));
|
||||
lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
|
||||
lwe_dimension_in == 8192));
|
||||
// The number of samples should be lower than the number of streaming
|
||||
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
|
||||
// to the occupancy of 50%). The only supported value for k is 1, so
|
||||
@@ -223,7 +249,7 @@ void cuda_extract_bits_64(
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples);
|
||||
number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_extract_bits<uint64_t, Degree<1024>>(
|
||||
@@ -235,7 +261,7 @@ void cuda_extract_bits_64(
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples);
|
||||
number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_extract_bits<uint64_t, Degree<2048>>(
|
||||
@@ -247,7 +273,31 @@ void cuda_extract_bits_64(
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples);
|
||||
number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_extract_bits<uint64_t, Degree<4096>>(
|
||||
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer,
|
||||
(uint64_t *)lwe_array_in_shifted_buffer,
|
||||
(uint64_t *)lwe_array_out_ks_buffer,
|
||||
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples, max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_extract_bits<uint64_t, Degree<8192>>(
|
||||
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
|
||||
(uint64_t *)lwe_array_in, (uint64_t *)lwe_array_in_buffer,
|
||||
(uint64_t *)lwe_array_in_shifted_buffer,
|
||||
(uint64_t *)lwe_array_out_ks_buffer,
|
||||
(uint64_t *)lwe_array_out_pbs_buffer, (uint64_t *)lut_pbs,
|
||||
(uint32_t *)lut_vector_indexes, (uint64_t *)ksk, (double2 *)fourier_bsk,
|
||||
number_of_bits, delta_log, lwe_dimension_in, lwe_dimension_out,
|
||||
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
|
||||
number_of_samples, max_shared_memory);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
||||
@@ -468,8 +468,8 @@ __host__ void host_extract_bits(
|
||||
uint32_t *lut_vector_indexes, Torus *ksk, double2 *fourier_bsk,
|
||||
uint32_t number_of_bits, uint32_t delta_log, uint32_t lwe_dimension_in,
|
||||
uint32_t lwe_dimension_out, uint32_t base_log_bsk, uint32_t level_count_bsk,
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk,
|
||||
uint32_t number_of_samples) {
|
||||
uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t number_of_samples,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
uint32_t ciphertext_n_bits = sizeof(Torus) * 8;
|
||||
@@ -511,7 +511,7 @@ __host__ void host_extract_bits(
|
||||
v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_pbs,
|
||||
lut_vector_indexes, lwe_array_out_ks_buffer, fourier_bsk,
|
||||
lwe_dimension_out, lwe_dimension_in, base_log_bsk, level_count_bsk,
|
||||
number_of_samples, 1);
|
||||
number_of_samples, 1, max_shared_memory);
|
||||
|
||||
add_sub_and_mul_lwe<Torus, params><<<1, threads, 0, *stream>>>(
|
||||
lwe_array_in_shifted_buffer, lwe_array_in_buffer,
|
||||
|
||||
Reference in New Issue
Block a user