feat(concrete-cuda): Added classical pbs support for 2^14 polynomial size, added corresponding tests.

Author: Beka Barbakadze
Date: 2023-03-20 14:48:13 +04:00
Committed by: Agnès Leroy
Parent: 82ab395f71
Commit: b5839a3993

8 changed files with 4199 additions and 13 deletions


@@ -6,10 +6,11 @@
void checks_fast_bootstrap_amortized(int polynomial_size) {
assert(
("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
"1024, 2048, 4096, 8192, 16384",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
polynomial_size == 4096 || polynomial_size == 8192 ||
polynomial_size == 16384));
}
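For context, the whitelist above is exactly the powers of two from 2^8 up to the newly added 2^14. A minimal equivalent check, as an illustrative sketch only (not part of this commit; the helper name is hypothetical):

#include <cassert>

void check_supported_polynomial_size(int polynomial_size) {
  // Accept exactly the powers of two between 2^8 and 2^14.
  bool is_power_of_two =
      polynomial_size > 0 && (polynomial_size & (polynomial_size - 1)) == 0;
  assert(is_power_of_two && polynomial_size >= 256 &&
         polynomial_size <= 16384);
}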
/*
@@ -67,6 +68,11 @@ void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_bootstrap_amortized<uint32_t, int32_t, Degree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
@@ -118,6 +124,11 @@ void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_bootstrap_amortized<uint64_t, int64_t, Degree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
@@ -185,6 +196,14 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 16384:
host_bootstrap_amortized<uint32_t, Degree<16384>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
default:
break;
}
@@ -314,6 +333,14 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 16384:
host_bootstrap_amortized<uint64_t, Degree<16384>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
default:
break;
}
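All four new 16384 cases above follow the same idiom: the runtime polynomial size is mapped onto the compile-time Degree<N> template parameter, so each supported size gets its own fully specialized kernel instantiation. A minimal sketch of that dispatch pattern (illustrative only; run_for_degree and dispatch_on_polynomial_size are hypothetical names):

template <int N> void run_for_degree() {
  // launch the kernel instantiation specialized for polynomial size N
}

void dispatch_on_polynomial_size(int polynomial_size) {
  switch (polynomial_size) {
  case 8192:
    run_for_degree<8192>();
    break;
  case 16384: // size added by this commit
    run_for_degree<16384>();
    break;
  default:
    break; // other sizes are rejected earlier by the checks_* asserts
  }
}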


@@ -8,10 +8,11 @@ void checks_fast_bootstrap_low_latency(int glwe_dimension, int level_count,
assert((
"Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
"1024, 2048, 4096, 8192, 16384",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
polynomial_size == 4096 || polynomial_size == 8192 ||
polynomial_size == 16384));
// The number of samples should be lower than 4 * SM/((k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
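As a worked sketch of the bound stated in the comment above (the helper is hypothetical, not part of the commit): with k = 1, level_count = 2 and a GPU with 108 SMs, the cap is 4 * 108 / (2 * 2) = 108 samples.

int max_low_latency_samples(int number_of_sm, int glwe_dimension /* k */,
                            int level_count) {
  // Upper bound on the batch size: 4 * SM / ((k + 1) * l),
  // the factor 4 coming from the targeted 50% occupancy.
  return 4 * number_of_sm / ((glwe_dimension + 1) * level_count);
}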
@@ -89,6 +90,12 @@ void scratch_cuda_bootstrap_low_latency_32(
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 16384:
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
default:
break;
}
@@ -145,6 +152,12 @@ void scratch_cuda_bootstrap_low_latency_64(
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 16384:
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
default:
break;
}
@@ -217,6 +230,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 16384:
host_bootstrap_low_latency<uint32_t, Degree<16384>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
default:
break;
}
@@ -358,6 +379,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 16384:
host_bootstrap_low_latency<uint64_t, Degree<16384>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
default:
break;
}


@@ -198,6 +198,25 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<Degree<16384>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<Degree<16384>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<Degree<16384>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<16384>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
default:
break;
}
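The 16384 branch above uses the same full-shared-memory/no-shared-memory split as the smaller sizes: when the FFT working set exceeds the default dynamic shared-memory limit, the kernel either opts into a larger limit or falls back to a device-memory buffer (NOSM). A minimal sketch of the opt-in step, with a hypothetical kernel name:

__global__ void my_fft_kernel(double2 *in, double2 *out) {
  // FFT body omitted in this sketch
}

void launch_with_full_shared_memory(double2 *in, double2 *out,
                                    int shared_memory_size, cudaStream_t stream,
                                    dim3 grid, dim3 block) {
  // Kernels needing more than the default 48 KB of dynamic shared memory
  // per block must raise the limit explicitly before launch.
  cudaFuncSetAttribute(my_fft_kernel,
                       cudaFuncAttributeMaxDynamicSharedMemorySize,
                       shared_memory_size);
  cudaFuncSetCacheConfig(my_fft_kernel, cudaFuncCachePreferShared);
  my_fft_kernel<<<grid, block, shared_memory_size, stream>>>(in, out);
}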


@@ -295,8 +295,8 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
}
// compressed size = 8192 is actual polynomial size = 16384.
// this size is not supported yet by any of the concrete-cuda api.
// may be used in the future.
// from this size on, the twiddles can't fit in constant memory,
// so from here the butterfly operation accesses device memory.
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
@@ -306,7 +306,7 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
i1 = 2 * (params::degree / 8192) * twid_id +
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles[twid_id + 4096];
w = negtwiddles13[twid_id];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
@@ -353,8 +353,9 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
// butterfly operation is started from last level
// compressed size = 8192 is actual polynomial size = 16384.
// this size is not supported yet by any of the concrete-cuda api.
// may be used in the future.
// twiddles for this size can't fit in constant memory, so the
// butterfly operation for this level accesses device memory to fetch
// the twiddles.
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
@@ -364,7 +365,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
i1 = 2 * (params::degree / 8192) * twid_id +
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles[twid_id + 4096];
w = negtwiddles13[twid_id];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;

File diff suppressed because it is too large.


@@ -2,6 +2,13 @@
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
#define GPU_BOOTSTRAP_TWIDDLES_CUH
extern __constant__ double2 negtwiddles[4096];
/*
* 'negtwiddles' is stored in constant memory for faster access times.
* Because of its limited size, only twiddles for polynomial sizes up to 2^12
* can be stored there; twiddles for 2^13 are stored in device memory in
* 'negtwiddles13'.
*/
extern __constant__ double2 negtwiddles[4096];
extern __device__ double2 negtwiddles13[4096];
#endif
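The split is forced by the size of the constant-memory bank: a double2 is 16 bytes, so negtwiddles[4096] already occupies the full 64 KB of __constant__ memory available on NVIDIA GPUs, and the extra 4096 twiddles for the next level have to live in regular device memory. A quick sanity-check sketch (not part of the header):

static_assert(4096 * sizeof(double2) == 64 * 1024,
              "the constant-memory twiddle table already fills the 64 KB bank");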


@@ -10,8 +10,10 @@ constexpr int choose_opt(int degree) {
return 8;
else if (degree == 4096)
return 16;
else
else if (degree == 8192)
return 32;
else
return 64;
}
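Assuming 'opt' is the number of coefficients each thread handles (an interpretation, not stated in this hunk), the new branches keep the per-block work split constant across sizes: 8192 / 32 = 256 and 16384 / 64 = 256, matching 4096 / 16 and 2048 / 8. A compile-time check as a sketch (also assuming the branches not shown in the hunk only match smaller degrees):

static_assert(8192 / choose_opt(8192) == 256,
              "degree / opt stays at 256 for 2^13");
static_assert(16384 / choose_opt(16384) == 256,
              "degree / opt stays at 256 for 2^14");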
template <class params> class HalfDegree {
public:


@@ -320,7 +320,10 @@ TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
2, 1, 50},
(BootstrapTestParams){881, 1, 8192, 0.000007069849454709433,
0.00000000000000029403601535432533, 22, 1, 2, 1,
2, 1, 25});
2, 1, 25},
(BootstrapTestParams){976, 1, 16384, 0.000007069849454709433,
0.00000000000000029403601535432533, 11, 3, 4, 1,
2, 1, 10});
std::string printParamName(::testing::TestParamInfo<BootstrapTestParams> p) {
BootstrapTestParams params = p.param;