feat(cuda): Add support for the classical PBS for polynomial_size=256.

Pedro Alves authored on 2023-02-13 19:58:46 -03:00
Committed by Agnès Leroy
parent 730274f156
commit bfb07b961d
3 changed files with 75 additions and 20 deletions
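Note on the pattern being extended: every PBS entry point validates the runtime polynomial_size, then switches on it to pick a compile-time Degree<N> template instantiation; this commit adds the 256 branch everywhere that switch appears. The following is a minimal, self-contained sketch of that dispatch idiom, with simplified stand-ins (Degree, run_pbs, dispatch_pbs are illustrative names, not the actual concrete-cuda API).

#include <cassert>
#include <cstdint>
#include <cstdio>

// Illustrative stand-in for the Degree<N> tag used in the diff: it carries the
// polynomial size as a compile-time constant, so the kernel can size shared
// memory and unroll its loops statically.
template <uint32_t N> struct Degree {
  static constexpr uint32_t value = N;
};

// Hypothetical templated worker, analogous to host_bootstrap_amortized<T, Degree<N>>.
template <typename Torus, typename params>
void run_pbs(uint32_t polynomial_size) {
  printf("PBS instantiated for N = %u (runtime N = %u)\n", params::value,
         polynomial_size);
}

// The commit adds `case 256` to switches of this shape, mapping the runtime
// polynomial_size onto a new compile-time instantiation.
void dispatch_pbs(uint32_t polynomial_size) {
  assert(("polynomial size should be one of 256, 512, 1024, 2048, 4096, 8192",
          polynomial_size == 256 || polynomial_size == 512 ||
              polynomial_size == 1024 || polynomial_size == 2048 ||
              polynomial_size == 4096 || polynomial_size == 8192));
  switch (polynomial_size) {
  case 256:
    run_pbs<uint64_t, Degree<256>>(polynomial_size); // newly supported size
    break;
  case 512:
    run_pbs<uint64_t, Degree<512>>(polynomial_size);
    break;
  default:
    break; // 1024/2048/4096/8192 elided for brevity
  }
}

int main() { dispatch_pbs(256); }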


@@ -13,13 +13,22 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
   assert(
       ("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32));
-  assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
-          "1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-          polynomial_size == 2048 || polynomial_size == 4096 ||
-          polynomial_size == 8192));
+  assert(
+      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
+       "1024, 2048, 4096, 8192",
+       polynomial_size == 256 || polynomial_size == 512 ||
+       polynomial_size == 1024 || polynomial_size == 2048 ||
+       polynomial_size == 4096 || polynomial_size == 8192));
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_amortized<uint32_t, Degree<256>>(
+        v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        lwe_idx, max_shared_memory);
+    break;
   case 512:
     host_bootstrap_amortized<uint32_t, Degree<512>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
@@ -139,13 +148,22 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
   assert(
       ("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64));
-  assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
-          "1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-          polynomial_size == 2048 || polynomial_size == 4096 ||
-          polynomial_size == 8192));
+  assert(
+      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
+       "1024, 2048, 4096, 8192",
+       polynomial_size == 256 || polynomial_size == 512 ||
+       polynomial_size == 1024 || polynomial_size == 2048 ||
+       polynomial_size == 4096 || polynomial_size == 8192));
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_amortized<uint64_t, Degree<256>>(
+        v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        lwe_idx, max_shared_memory);
+    break;
   case 512:
     host_bootstrap_amortized<uint64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
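A note on the assertion style used throughout these entry points: the string literal and the boolean condition are joined with the comma operator inside an extra pair of parentheses, so the message does not affect the result of the check, but it appears in the text assert() prints when the check fails. A minimal, standalone sketch of the idiom:

#include <cassert>

int main() {
  int polynomial_size = 300; // deliberately unsupported, so the assert fires
  // The comma operator evaluates and discards the string literal, then yields
  // the condition; on failure, assert() aborts and its diagnostic contains the
  // stringified expression, message included.
  assert(("polynomial size should be one of 256, 512, 1024, 2048, 4096, 8192",
          polynomial_size == 256 || polynomial_size == 512));
  return 0;
}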


@@ -16,11 +16,12 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
assert(("Error (GPU low latency PBS): base log should be <= 32",
base_log <= 32));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
assert((
"Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than 4 * SM/((k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
@@ -33,6 +34,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
           num_samples <= number_of_sm * 4. / 2. / level_count));
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_low_latency<uint32_t, Degree<256>>(
+        v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
+    break;
   case 512:
     host_bootstrap_low_latency<uint32_t, Degree<512>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
@@ -163,11 +172,12 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
assert(("Error (GPU low latency PBS): base log should be <= 64",
base_log <= 64));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
assert((
"Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than 4 * SM/((k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
@@ -180,6 +190,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
           num_samples <= number_of_sm * 4. / 2. / level_count));
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_low_latency<uint64_t, Degree<256>>(
+        v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
+    break;
   case 512:
     host_bootstrap_low_latency<uint64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
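The occupancy comment repeated above the low-latency switches translates into a concrete cap on num_samples: at most number_of_sm * 4 / ((k + 1) * level_count) with k = 1. A small standalone sketch of that check, using an assumed value of 108 SMs for illustration (the helper name below is a stand-in, not the library's):

#include <cassert>
#include <cstdint>

// Illustrative version of the low-latency PBS batch-size constraint: with
// k = 1 (so k + 1 = 2) and the factor 4 tied to 50% occupancy, at most
// number_of_sm * 4 / 2 / level_count samples fit in one call.
bool low_latency_batch_fits(int number_of_sm, uint32_t level_count,
                            uint32_t num_samples) {
  return num_samples <= number_of_sm * 4. / 2. / level_count;
}

int main() {
  // Example: 108 SMs (an A100-class GPU) and level_count = 2 give a limit of
  // 108 * 4 / 2 / 2 = 108 samples per call.
  assert(low_latency_batch_fits(108, 2, 108));
  assert(!low_latency_batch_fits(108, 2, 109));
}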


@@ -84,6 +84,25 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
   double2 *buffer;
   switch (polynomial_size) {
+  case 256:
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
+      batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, FULLSM>
+          <<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
+                                                                 buffer);
+    } else {
+      buffer = (double2 *)cuda_malloc_async(
+          shared_memory_size * total_polynomials, stream, gpu_index);
+      batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, NOSM>
+          <<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
+    }
+    break;
   case 512:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
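For the bootstrapping-key conversion, the new case 256 follows the same split as the larger degrees: if the FFT's working set fits in the device's opt-in shared memory limit, the FULLSM kernel is launched with dynamic shared memory (after raising cudaFuncAttributeMaxDynamicSharedMemorySize); otherwise the NOSM variant runs against a global-memory scratch buffer. A standalone sketch of the device query that a helper such as cuda_get_max_shared_memory presumably wraps; the query itself is plain CUDA runtime API, while the 4 KB working-set figure below is only an illustrative assumption, not the value the library computes.

#include <cstdio>
#include <cuda_runtime.h>

// Query the opt-in per-block dynamic shared memory limit. Kernels may only use
// more than the default 48 KB after cudaFuncSetAttribute raises
// cudaFuncAttributeMaxDynamicSharedMemorySize, as the diff above does.
int max_dynamic_shared_memory(int gpu_index) {
  int max_shared = 0;
  cudaDeviceGetAttribute(&max_shared, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         gpu_index);
  return max_shared;
}

int main() {
  // Assumed size for illustration only: a working set on the order of
  // 256 double2 values (~4 KB), which would comfortably take the
  // full-shared-memory path on any recent GPU.
  size_t shared_memory_size = 256 * sizeof(double2);
  printf("assumed working set: %zu bytes, device limit: %d bytes\n",
         shared_memory_size, max_dynamic_shared_memory(0));
}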