mirror of https://github.com/zama-ai/concrete.git
synced 2026-02-08 03:25:05 -05:00
feat(concrete-cuda): Added classical pbs support for 2^14 polynomial size, added corresponding tests.
committed by Agnès Leroy
parent 82ab395f71
commit b5839a3993
@@ -6,10 +6,11 @@
 void checks_fast_bootstrap_amortized(int polynomial_size) {
   assert(
       ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
-       "1024, 2048, 4096, 8192",
+       "1024, 2048, 4096, 8192, 16384",
        polynomial_size == 256 || polynomial_size == 512 ||
        polynomial_size == 1024 || polynomial_size == 2048 ||
-       polynomial_size == 4096 || polynomial_size == 8192));
+       polynomial_size == 4096 || polynomial_size == 8192 ||
+       polynomial_size == 16384));
 }

 /*
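The ("message", condition) form inside these asserts relies on the C++ comma operator: the string literal is evaluated and discarded, the condition supplies the value of the expression, and the message still appears in the stringified text the runtime prints when the assertion fires. A minimal sketch of the idiom (the function name is illustrative, not part of concrete-cuda):

#include <cassert>

// Comma-operator assert: the string is discarded at runtime but shows up
// in the failed-assertion output, documenting the check inline.
void check_size(int polynomial_size) {
  assert(("polynomial size should be a supported power of two",
          polynomial_size == 8192 || polynomial_size == 16384));
}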
@@ -67,6 +68,11 @@ void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
         v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
         input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
     break;
+  case 16384:
+    scratch_bootstrap_amortized<uint32_t, int32_t, Degree<16384>>(
+        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
+        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+    break;
   default:
     break;
   }
@@ -118,6 +124,11 @@ void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
         v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
         input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
     break;
+  case 16384:
+    scratch_bootstrap_amortized<uint64_t, int64_t, Degree<16384>>(
+        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
+        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+    break;
   default:
     break;
   }
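Every scratch_* and bootstrap entry point repeats this dispatch because the kernels take the polynomial degree as a compile-time template parameter, so the runtime polynomial_size has to be mapped onto a concrete Degree<N> instantiation; unsupported sizes fall through to the empty default, having already been rejected by the checks_* asserts. A reduced sketch of the pattern (names other than Degree are illustrative, not the library's API):

#include <cstdint>

// Runtime value -> compile-time template instantiation via switch.
template <int N> struct Degree { static constexpr int value = N; };

template <typename Torus, class params> void scratch_for_degree() {
  // kernel setup specialised on params::value would go here
}

void scratch_dispatch(int polynomial_size) {
  switch (polynomial_size) {
  case 8192:
    scratch_for_degree<uint64_t, Degree<8192>>();
    break;
  case 16384: // the case this commit adds
    scratch_for_degree<uint64_t, Degree<16384>>();
    break;
  default:
    break;
  }
}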
@@ -185,6 +196,14 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
         polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
         lwe_idx, max_shared_memory);
     break;
+  case 16384:
+    host_bootstrap_amortized<uint32_t, Degree<16384>>(
+        v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        lwe_idx, max_shared_memory);
+    break;
   default:
     break;
   }
@@ -314,6 +333,14 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
         polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
         lwe_idx, max_shared_memory);
     break;
+  case 16384:
+    host_bootstrap_amortized<uint64_t, Degree<16384>>(
+        v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        lwe_idx, max_shared_memory);
+    break;
   default:
     break;
   }
@@ -8,10 +8,11 @@ void checks_fast_bootstrap_low_latency(int glwe_dimension, int level_count,

   assert((
       "Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
-      "1024, 2048, 4096, 8192",
+      "1024, 2048, 4096, 8192, 16384",
       polynomial_size == 256 || polynomial_size == 512 ||
       polynomial_size == 1024 || polynomial_size == 2048 ||
-      polynomial_size == 4096 || polynomial_size == 8192));
+      polynomial_size == 4096 || polynomial_size == 8192 ||
+      polynomial_size == 16384));
   // The number of samples should be lower than 4 * SM/((k + 1) * l) (the
   // factor 4 being related to the occupancy of 50%). The only supported
   // value for k is 1, so k + 1 = 2 for now.
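A quick instantiation of the bound in that comment (the helper and its names are illustrative, not the library's API): with 108 SMs, k = 1 and l = 2, at most 4 * 108 / (2 * 2) = 108 samples can be processed per call.

// Sketch of the stated bound: num_samples < 4 * SM / ((k + 1) * l),
// the factor 4 coming from the targeted 50% occupancy.
int max_low_latency_samples(int sm_count, int glwe_dimension /* k */,
                            int level_count /* l */) {
  return 4 * sm_count / ((glwe_dimension + 1) * level_count);
}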
@@ -89,6 +90,12 @@ void scratch_cuda_bootstrap_low_latency_32(
         level_count, input_lwe_ciphertext_count, max_shared_memory,
         allocate_gpu_memory);
     break;
+  case 16384:
+    scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<16384>>(
+        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
+        level_count, input_lwe_ciphertext_count, max_shared_memory,
+        allocate_gpu_memory);
+    break;
   default:
     break;
   }
@@ -145,6 +152,12 @@ void scratch_cuda_bootstrap_low_latency_64(
         level_count, input_lwe_ciphertext_count, max_shared_memory,
         allocate_gpu_memory);
     break;
+  case 16384:
+    scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<16384>>(
+        v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
+        level_count, input_lwe_ciphertext_count, max_shared_memory,
+        allocate_gpu_memory);
+    break;
   default:
     break;
   }
@@ -217,6 +230,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
         polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
         max_shared_memory);
     break;
+  case 16384:
+    host_bootstrap_low_latency<uint32_t, Degree<16384>>(
+        v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
+    break;
   default:
     break;
   }
@@ -358,6 +379,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
         polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
         max_shared_memory);
     break;
+  case 16384:
+    host_bootstrap_low_latency<uint64_t, Degree<16384>>(
+        v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
+    break;
   default:
     break;
   }
@@ -198,6 +198,25 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
           <<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
     }
     break;
+  case 16384:
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<Degree<16384>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<Degree<16384>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
+      batch_NSMFFT<FFTDegree<Degree<16384>, ForwardFFT>, FULLSM>
+          <<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
+                                                                 buffer);
+    } else {
+      buffer = (double2 *)cuda_malloc_async(
+          shared_memory_size * total_polynomials, stream, gpu_index);
+      batch_NSMFFT<FFTDegree<Degree<16384>, ForwardFFT>, NOSM>
+          <<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
+    }
+    break;
   default:
     break;
   }
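The new 16384 case follows the standard CUDA opt-in for large dynamic shared memory: a kernel that needs more than the default 48 KB must raise cudaFuncAttributeMaxDynamicSharedMemorySize before launch, and the code falls back to a plain device-memory buffer (the NOSM variant) when even the device's maximum is insufficient. A self-contained sketch of the opt-in half of that pattern (fft_kernel is a placeholder, not the library's kernel):

#include <cuda_runtime.h>

__global__ void fft_kernel(double2 *data) { /* ... */ }

// Raise the dynamic shared memory cap, bias the L1/shared split toward
// shared memory, then pass the size as the third launch parameter.
void launch_large_smem(double2 *data, int shared_memory_size, dim3 grid,
                       dim3 block, cudaStream_t stream) {
  cudaFuncSetAttribute(fft_kernel,
                       cudaFuncAttributeMaxDynamicSharedMemorySize,
                       shared_memory_size);
  cudaFuncSetCacheConfig(fft_kernel, cudaFuncCachePreferShared);
  fft_kernel<<<grid, block, shared_memory_size, stream>>>(data);
}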
@@ -295,8 +295,8 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
   }

   // compressed size = 8192 is actual polynomial size = 16384.
-  // this size is not supported yet by any of the concrete-cuda api.
-  // may be used in the future.
+  // from this size on, twiddles can't fit in constant memory,
+  // so from here the butterfly operation accesses device memory.
   if constexpr (params::degree >= 8192) {
     // level 13
     tid = threadIdx.x;
@@ -306,7 +306,7 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
       i1 = 2 * (params::degree / 8192) * twid_id +
            (tid & (params::degree / 8192 - 1));
       i2 = i1 + params::degree / 8192;
-      w = negtwiddles[twid_id + 4096];
+      w = negtwiddles13[twid_id];
       u = A[i1];
       v.x = A[i2].x * w.x - A[i2].y * w.y;
       v.y = A[i2].y * w.x + A[i2].x * w.y;
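The v.x/v.y pair in each butterfly is an inlined complex multiplication v = A[i2] * w, with double2 packing (real, imaginary). Written out as a helper for clarity (a sketch; the file keeps the operation inlined):

// v = a * b for complex values packed as double2 (x = real, y = imag):
// (a.x + i a.y)(b.x + i b.y) = (a.x b.x - a.y b.y) + i (a.y b.x + a.x b.y)
__device__ inline double2 complex_mul(double2 a, double2 b) {
  double2 r;
  r.x = a.x * b.x - a.y * b.y;
  r.y = a.y * b.x + a.x * b.y;
  return r;
}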
@@ -353,8 +353,9 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
   // butterfly operation is started from last level

   // compressed size = 8192 is actual polynomial size = 16384.
-  // this size is not supported yet by any of the concrete-cuda api.
-  // may be used in the future.
+  // twiddles for this size can't fit in constant memory, so the
+  // butterfly operation for this level accesses device memory to
+  // fetch its twiddles.
   if constexpr (params::degree >= 8192) {
     // level 13
     tid = threadIdx.x;
@@ -364,7 +365,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
       i1 = 2 * (params::degree / 8192) * twid_id +
            (tid & (params::degree / 8192 - 1));
       i2 = i1 + params::degree / 8192;
-      w = negtwiddles[twid_id + 4096];
+      w = negtwiddles13[twid_id];
       u.x = A[i1].x - A[i2].x;
       u.y = A[i1].y - A[i2].y;
       A[i1].x += A[i2].x;
File diff suppressed because it is too large
@@ -2,6 +2,13 @@
 #ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
 #define GPU_BOOTSTRAP_TWIDDLES_CUH

-extern __constant__ double2 negtwiddles[4096];
+/*
+ * 'negtwiddles' are stored in constant memory for faster access times.
+ * Because of its limited size, only twiddles for up to 2^12 polynomial size
+ * can be stored there; twiddles for 2^13 are stored in device memory as
+ * 'negtwiddles13'.
+ */
+
+extern __constant__ double2 negtwiddles[4096];
+extern __device__ double2 negtwiddles13[4096];
 #endif
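Because negtwiddles13 lives in ordinary device memory rather than constant memory, spilling past the constant-memory limit only changes the qualifier on the symbol; cudaMemcpyToSymbol accepts __device__ symbols as well as __constant__ ones. A hypothetical initialization sketch (the upload helper is illustrative; the commit's actual twiddle values are in the suppressed file above):

#include <cuda_runtime.h>

__device__ double2 negtwiddles13[4096];

// Host-side upload: works unchanged for __constant__ or __device__ symbols.
void upload_twiddles13(const double2 *host_twiddles) {
  cudaMemcpyToSymbol(negtwiddles13, host_twiddles, 4096 * sizeof(double2));
}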
@@ -10,8 +10,10 @@ constexpr int choose_opt(int degree) {
     return 8;
   else if (degree == 4096)
     return 16;
-  else
+  else if (degree == 8192)
+    return 32;
+  else
     return 64;
 }
 template <class params> class HalfDegree {
 public:
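The new degree == 8192 branch keeps choose_opt on its existing pattern: opt appears to be the number of coefficients each thread processes, so degree / opt stays pinned at 256 threads per block for every supported size (an inference from the returned values, not stated in the diff):

// Every branch of choose_opt keeps degree / opt == 256, i.e. one
// 256-thread block per polynomial regardless of its size:
static_assert(4096 / 16 == 256, "2^12");
static_assert(8192 / 32 == 256, "2^13");
static_assert(16384 / 64 == 256, "2^14 (added by this commit)");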
@@ -320,7 +320,10 @@ TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
                           2, 1, 50},
     (BootstrapTestParams){881, 1, 8192, 0.000007069849454709433,
                           0.00000000000000029403601535432533, 22, 1, 2, 1,
-                          2, 1, 25});
+                          2, 1, 25},
+    (BootstrapTestParams){976, 1, 16384, 0.000007069849454709433,
+                          0.00000000000000029403601535432533, 11, 3, 4, 1,
+                          2, 1, 10});

 std::string printParamName(::testing::TestParamInfo<BootstrapTestParams> p) {
   BootstrapTestParams params = p.param;