Mirror of https://github.com/zama-ai/concrete.git (synced 2026-02-08 11:35:02 -05:00)

feat(cuda): Add support for the classical PBS for polynomial_size=256.
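The commit extends the existing per-size dispatch in the amortized PBS, the low-latency PBS, and the bootstrap-key FFT conversion so that polynomial_size == 256 is accepted and routed to a Degree<256> template instantiation, exactly as the larger sizes already are. For orientation, the sketch below shows the general "switch on a runtime size, instantiate a compile-time Degree<N>" pattern in isolation; Degree is taken from the diff, while run_for_degree and dispatch are hypothetical names used only for illustration, not the library's API.

// Sketch: map a runtime size onto a compile-time template parameter.
// Degree mirrors the diff; run_for_degree/dispatch are hypothetical stand-ins.
#include <cstdint>
#include <cstdio>

template <int N> struct Degree { static constexpr int value = N; };

template <typename Torus, typename Params> void run_for_degree() {
  std::printf("instantiated for N = %d, %zu-bit torus\n", Params::value,
              sizeof(Torus) * 8);
}

void dispatch(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 256:  run_for_degree<uint64_t, Degree<256>>();  break;
  case 512:  run_for_degree<uint64_t, Degree<512>>();  break;
  case 1024: run_for_degree<uint64_t, Degree<1024>>(); break;
  default:   std::printf("unsupported polynomial size\n"); break;
  }
}

int main() { dispatch(256); }
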
@@ -13,13 +13,22 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
 
   assert(
       ("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32));
-  assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
-          "1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-              polynomial_size == 2048 || polynomial_size == 4096 ||
-              polynomial_size == 8192));
+  assert(
+      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
+       "1024, 2048, 4096, 8192",
+       polynomial_size == 256 || polynomial_size == 512 ||
+           polynomial_size == 1024 || polynomial_size == 2048 ||
+           polynomial_size == 4096 || polynomial_size == 8192));
 
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_amortized<uint32_t, Degree<256>>(
+        v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        lwe_idx, max_shared_memory);
+    break;
   case 512:
     host_bootstrap_amortized<uint32_t, Degree<512>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,

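A note on the assert(("message", condition)) form used throughout these files: the inner parentheses make a comma expression, so the condition on the right decides whether the assertion fires while the string literal on the left shows up in the printed assertion text. A minimal standalone illustration:

// Sketch of the ("message", condition) assert idiom used in the diff.
#include <cassert>

int main() {
  int polynomial_size = 256;
  // The comma expression evaluates to the right-hand condition; the string
  // literal is only there so it appears in the failure message.
  assert(("polynomial size should be a supported power of two",
          polynomial_size == 256 || polynomial_size == 512));
  return 0;
}
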
@@ -139,13 +148,22 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
 
   assert(
       ("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64));
-  assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
-          "1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-              polynomial_size == 2048 || polynomial_size == 4096 ||
-              polynomial_size == 8192));
+  assert(
+      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
+       "1024, 2048, 4096, 8192",
+       polynomial_size == 256 || polynomial_size == 512 ||
+           polynomial_size == 1024 || polynomial_size == 2048 ||
+           polynomial_size == 4096 || polynomial_size == 8192));
 
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_amortized<uint64_t, Degree<256>>(
+        v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        lwe_idx, max_shared_memory);
+    break;
   case 512:
     host_bootstrap_amortized<uint64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,

@@ -16,11 +16,12 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
 
   assert(("Error (GPU low latency PBS): base log should be <= 32",
           base_log <= 32));
-  assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
-          "1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-              polynomial_size == 2048 || polynomial_size == 4096 ||
-              polynomial_size == 8192));
+  assert((
+      "Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
+      "1024, 2048, 4096, 8192",
+      polynomial_size == 256 || polynomial_size == 512 ||
+          polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
   // The number of samples should be lower than 4 * SM/((k + 1) * l) (the
   // factor 4 being related to the occupancy of 50%). The only supported
   // value for k is 1, so k + 1 = 2 for now.

@@ -33,6 +34,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
           num_samples <= number_of_sm * 4. / 2. / level_count));
 
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_low_latency<uint32_t, Degree<256>>(
+        v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
+    break;
   case 512:
     host_bootstrap_low_latency<uint32_t, Degree<512>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,

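As a worked example of the bound described in the comment above: with k = 1 the cap is 4 * SM / ((k + 1) * l), so a hypothetical GPU with 108 SMs and level_count = 2 admits at most 108 samples per launch (108 is an assumed SM count for illustration, not a value fixed by the library):

// Sketch: evaluate the low-latency PBS sample cap from the comment above.
// number_of_sm = 108 is an assumed example; in practice it is queried with
// cudaDeviceGetAttribute(..., cudaDevAttrMultiProcessorCount, ...).
#include <cstdio>

int main() {
  int number_of_sm = 108; // hypothetical SM count
  int level_count = 2;    // decomposition level count l
  int k = 1;              // only supported GLWE dimension for now
  double max_samples = number_of_sm * 4. / (k + 1) / level_count;
  std::printf("at most %.0f samples per launch\n", max_samples); // prints 108
  return 0;
}
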
@@ -163,11 +172,12 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
 
   assert(("Error (GPU low latency PBS): base log should be <= 64",
           base_log <= 64));
-  assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
-          "1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-              polynomial_size == 2048 || polynomial_size == 4096 ||
-              polynomial_size == 8192));
+  assert((
+      "Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
+      "1024, 2048, 4096, 8192",
+      polynomial_size == 256 || polynomial_size == 512 ||
+          polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
   // The number of samples should be lower than 4 * SM/((k + 1) * l) (the
   // factor 4 being related to the occupancy of 50%). The only supported
   // value for k is 1, so k + 1 = 2 for now.

@@ -180,6 +190,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
           num_samples <= number_of_sm * 4. / 2. / level_count));
 
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_low_latency<uint64_t, Degree<256>>(
+        v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
+    break;
   case 512:
     host_bootstrap_low_latency<uint64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,

@@ -84,6 +84,25 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
 
   double2 *buffer;
   switch (polynomial_size) {
+  case 256:
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
+      batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, FULLSM>
+          <<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
+                                                                 buffer);
+    } else {
+      buffer = (double2 *)cuda_malloc_async(
+          shared_memory_size * total_polynomials, stream, gpu_index);
+      batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, NOSM>
+          <<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
+    }
+    break;
   case 512:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);

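The new case 256 branch above follows the same two-path scheme as the other sizes: when the FFT working set fits in dynamic shared memory, the FULLSM kernel variant is launched after raising its cudaFuncAttributeMaxDynamicSharedMemorySize; otherwise a scratch buffer is allocated in global memory and the NOSM variant is used. The sketch below reproduces that pattern with a placeholder demo_kernel (a hypothetical stand-in, not the library's batch_NSMFFT):

// Sketch of the "shared memory if it fits, global scratch otherwise" pattern.
// demo_kernel is a placeholder; the real code launches batch_NSMFFT variants.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void demo_kernel(double2 *out, double2 *scratch) {
  extern __shared__ double2 smem[]; // unused when launched with 0 bytes
  (void)smem; (void)out; (void)scratch;
}

int main() {
  int device = 0, max_optin_smem = 0;
  cudaDeviceGetAttribute(&max_optin_smem,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, device);

  size_t needed = 256 * sizeof(double2); // per-block working set for N = 256
  double2 *out = nullptr, *scratch = nullptr;
  cudaMalloc(&out, needed);

  if (needed <= (size_t)max_optin_smem) {
    // Opt in to the larger dynamic shared memory carve-out, then launch.
    cudaFuncSetAttribute(demo_kernel,
                         cudaFuncAttributeMaxDynamicSharedMemorySize,
                         (int)needed);
    demo_kernel<<<1, 128, needed, 0>>>(out, scratch);
  } else {
    // Fall back: allocate the working set in global memory instead.
    cudaMalloc(&scratch, needed);
    demo_kernel<<<1, 128, 0, 0>>>(out, scratch);
  }
  cudaDeviceSynchronize();
  std::printf("done\n");
  return 0;
}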