mirror of https://github.com/zama-ai/tfhe-rs.git, synced 2026-01-10 07:08:03 -05:00
fix(gpu): remove all resettings of shared memory size
@@ -247,14 +247,5 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) {
  cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
                         gpu_index);
  check_cuda_error(cudaGetLastError());
#if CUDA_ARCH == 900
  max_shared_memory = 226000;
#elif CUDA_ARCH == 890
  max_shared_memory = 127000;
#elif CUDA_ARCH == 800
  max_shared_memory = 163000;
#elif CUDA_ARCH == 700
  max_shared_memory = 95000;
#endif
  return max_shared_memory;
}
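Editor's note: the hunk above is where the backend previously overwrote the queried limit with hard-coded per-architecture values (226 KB for arch 900, 163 KB for 800, 127 KB for 890, 95 KB for 700). A minimal sketch of the post-commit behaviour, assuming only the device-attribute query is kept (illustrative, not the exact tfhe-rs code):

#include <cstdint>
#include <cuda_runtime.h>

// Illustrative sketch: return the per-block shared memory limit reported by
// the driver instead of resetting it to a hard-coded per-architecture value.
int cuda_get_max_shared_memory_sketch(uint32_t gpu_index) {
  int max_shared_memory = 0;
  cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock,
                         (int)gpu_index);
  return max_shared_memory;
}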
@@ -116,12 +116,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
  switch (polynomial_size) {
  case 256:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                buffer);
@@ -134,12 +128,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    break;
  case 512:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                buffer);
@@ -152,12 +140,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    break;
  case 1024:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                buffer);
@@ -170,12 +152,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    break;
  case 2048:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                buffer);
@@ -188,12 +164,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    break;
  case 4096:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                buffer);
@@ -206,12 +176,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    break;
  case 8192:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                buffer);
@@ -224,12 +188,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
    break;
  case 16384:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      check_cuda_error(cudaFuncSetAttribute(
          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
          cudaFuncCachePreferShared));
      batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                buffer);
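Editor's note: every case above repeats the same host-side preparation before the batch_NSMFFT launch: compare the requested dynamic shared memory against the device limit, raise cudaFuncAttributeMaxDynamicSharedMemorySize, and prefer shared memory over L1 cache. A hedged, generic sketch of that pattern follows; the helper name and the CHECK_CUDA macro are illustrative stand-ins, not the backend's check_cuda_error.

#include <cstdio>
#include <cuda_runtime.h>

// Stand-in for the backend's check_cuda_error helper (assumption).
#define CHECK_CUDA(expr)                                                       \
  do {                                                                         \
    cudaError_t err_ = (expr);                                                 \
    if (err_ != cudaSuccess)                                                   \
      fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err_));           \
  } while (0)

// Opt a kernel into a large dynamic shared-memory footprint, mirroring the
// per-case blocks above; the launch itself stays at the call site.
template <typename KernelT>
void configure_full_sm(KernelT kernel, int shared_memory_size,
                       int max_shared_memory) {
  if (shared_memory_size <= max_shared_memory) {
    CHECK_CUDA(cudaFuncSetAttribute(
        kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
        shared_memory_size));
    CHECK_CUDA(cudaFuncSetCacheConfig(kernel, cudaFuncCachePreferShared));
  }
}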
@@ -270,14 +228,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
  case 256:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
@@ -291,14 +241,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
  case 512:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
@@ -312,14 +254,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
  case 1024:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
@@ -333,14 +267,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
  case 2048:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
@@ -354,14 +280,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
  case 4096:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
@@ -375,14 +293,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
  case 8192:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                output, buffer);
@@ -396,14 +306,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
  case 16384:
    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
      check_cuda_error(cudaFuncSetAttribute(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                               FULLSM>,
          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
      check_cuda_error(cudaFuncSetCacheConfig(
          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                               FULLSM>,
          cudaFuncCachePreferShared));
      batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                           FULLSM>
          <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
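Editor's note: in both the bootstrap-key conversion and the Fourier polynomial multiplication paths, the polynomial size is a runtime value while the FFT degree is a template parameter, so the switch maps each supported size to its own instantiation. A compact restatement of that dispatch shape, with hypothetical names rather than the tfhe-rs API:

#include <cstdint>

// Hypothetical functor template standing in for the per-degree kernels above;
// each supported polynomial size gets its own compile-time instantiation.
template <template <uint32_t> class PerDegreeOp>
bool dispatch_on_polynomial_size(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 256:    PerDegreeOp<256>{}();    return true;
  case 512:    PerDegreeOp<512>{}();    return true;
  case 1024:   PerDegreeOp<1024>{}();   return true;
  case 2048:   PerDegreeOp<2048>{}();   return true;
  case 4096:   PerDegreeOp<4096>{}();   return true;
  case 8192:   PerDegreeOp<8192>{}();   return true;
  case 16384:  PerDegreeOp<16384>{}();  return true;
  default:     return false; // unsupported size
  }
}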
@@ -258,28 +258,6 @@ __host__ void scratch_programmable_bootstrap_amortized(
    uint32_t glwe_dimension, uint32_t polynomial_size,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  uint64_t full_sm =
      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size, glwe_dimension);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
          polynomial_size);
  int max_shared_memory = cuda_get_max_shared_memory(0);
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
    cudaFuncSetAttribute(
        device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm);
    cudaFuncSetCacheConfig(
        device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
        cudaFuncCachePreferShared);
  } else if (max_shared_memory >= partial_sm) {
    check_cuda_error(cudaFuncSetAttribute(
        device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
    check_cuda_error(cudaFuncSetCacheConfig(
        device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
        cudaFuncCachePreferShared));
  }
  if (allocate_gpu_memory) {
    uint64_t buffer_size =
        get_buffer_size_programmable_bootstrap_amortized<Torus>(
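Editor's note: the scratch function above chooses between the PARTIALSM and FULLSM kernel variants purely from the device limit and the two shared-memory footprints; if neither fits, no attribute is raised and the NOSM path is used at launch time. A hedged restatement of that selection, with illustrative enum and function names:

#include <cstdint>

enum class SmVariant { NOSM, PARTIALSM, FULLSM };

// Mirrors the if/else-if above: partial fits but full does not -> PARTIALSM,
// partial fits (and so does full) -> FULLSM, otherwise NOSM.
SmVariant select_sm_variant(uint64_t max_shared_memory, uint64_t partial_sm,
                            uint64_t full_sm) {
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm)
    return SmVariant::PARTIALSM;
  if (max_shared_memory >= partial_sm)
    return SmVariant::FULLSM;
  return SmVariant::NOSM;
}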
@@ -194,30 +194,6 @@ __host__ void scratch_programmable_bootstrap_cg(
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  uint64_t full_sm =
      get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
          polynomial_size);
  int max_shared_memory = cuda_get_max_shared_memory(0);
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
    check_cuda_error(cudaFuncSetAttribute(
        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
    cudaFuncSetCacheConfig(
        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else if (max_shared_memory >= partial_sm) {
    check_cuda_error(cudaFuncSetAttribute(
        device_programmable_bootstrap_cg<Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
    cudaFuncSetCacheConfig(
        device_programmable_bootstrap_cg<Torus, params, FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }

  *buffer = new pbs_buffer<Torus, CLASSICAL>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, PBS_VARIANT::CG, allocate_gpu_memory);
@@ -213,69 +213,6 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
          polynomial_size);
  uint64_t full_sm_cg_accumulate =
      get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
          polynomial_size);
  uint64_t partial_sm_cg_accumulate =
      get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
          polynomial_size);

  int max_shared_memory = cuda_get_max_shared_memory(0);
  if (max_shared_memory < full_sm_keybundle) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
                                                          FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
                                                          FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }

  if (max_shared_memory < partial_sm_cg_accumulate) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
                                                              NOSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
                                                              NOSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else if (max_shared_memory < full_sm_cg_accumulate) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
                                                              PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
                                                              PARTIALSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
                                                              FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_cg_accumulate<Torus, params,
                                                              FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }

  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
      gpu_index, input_lwe_ciphertext_count, polynomial_size);
  *buffer = new pbs_buffer<Torus, MULTI_BIT>(
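Editor's note: for the multi-bit CG variant the same decision is written with the opposite comparisons: the keybundle kernel is a two-way NOSM/FULLSM choice, while the accumulate kernel is a three-way choice driven by its partial and full footprints. An equivalent sketch of the accumulate selection, with illustrative names:

#include <cstdint>

enum class AccumulateSm { NOSM, PARTIALSM, FULLSM };

// Same thresholds as the cg_accumulate branches above, expressed bottom-up.
AccumulateSm select_cg_accumulate_sm(uint64_t max_shared_memory,
                                     uint64_t partial_sm_cg_accumulate,
                                     uint64_t full_sm_cg_accumulate) {
  if (max_shared_memory < partial_sm_cg_accumulate)
    return AccumulateSm::NOSM;
  if (max_shared_memory < full_sm_cg_accumulate)
    return AccumulateSm::PARTIALSM;
  return AccumulateSm::FULLSM;
}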
@@ -309,55 +309,6 @@ __host__ void scratch_programmable_bootstrap(
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  uint64_t full_sm_step_one =
      get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
          polynomial_size);
  uint64_t full_sm_step_two =
      get_buffer_size_full_sm_programmable_bootstrap_step_two<Torus>(
          polynomial_size);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_programmable_bootstrap<Torus>(polynomial_size);

  int max_shared_memory = cuda_get_max_shared_memory(0);

  // Configure step one
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
    check_cuda_error(cudaFuncSetAttribute(
        device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
    cudaFuncSetCacheConfig(
        device_programmable_bootstrap_step_one<Torus, params, PARTIALSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else if (max_shared_memory >= partial_sm) {
    check_cuda_error(cudaFuncSetAttribute(
        device_programmable_bootstrap_step_one<Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
    cudaFuncSetCacheConfig(
        device_programmable_bootstrap_step_one<Torus, params, FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }

  // Configure step two
  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) {
    check_cuda_error(cudaFuncSetAttribute(
        device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
    cudaFuncSetCacheConfig(
        device_programmable_bootstrap_step_two<Torus, params, PARTIALSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else if (max_shared_memory >= partial_sm) {
    check_cuda_error(cudaFuncSetAttribute(
        device_programmable_bootstrap_step_two<Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two));
    cudaFuncSetCacheConfig(
        device_programmable_bootstrap_step_two<Torus, params, FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }

  *buffer = new pbs_buffer<Torus, CLASSICAL>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory);
@@ -390,97 +390,6 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
    uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
    bool allocate_gpu_memory) {

  int max_shared_memory = cuda_get_max_shared_memory(0);
  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
          polynomial_size);
  uint64_t full_sm_accumulate_step_one =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
          polynomial_size);
  uint64_t full_sm_accumulate_step_two =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
          polynomial_size);
  uint64_t partial_sm_accumulate_step_one =
      get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
          Torus>(polynomial_size);

  if (max_shared_memory < full_sm_keybundle) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
                                                          FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
                                                          FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }

  if (max_shared_memory < partial_sm_accumulate_step_one) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_accumulate_step_one<
            Torus, params, NOSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_accumulate_step_one<
            Torus, params, NOSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else if (max_shared_memory < full_sm_accumulate_step_one) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_accumulate_step_one<
            Torus, params, PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        partial_sm_accumulate_step_one));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_accumulate_step_one<
            Torus, params, PARTIALSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_accumulate_step_one<
            Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        full_sm_accumulate_step_one));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_accumulate_step_one<
            Torus, params, FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }

  if (max_shared_memory < full_sm_accumulate_step_two) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_accumulate_step_two<
            Torus, params, NOSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_accumulate_step_two<
            Torus, params, NOSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_accumulate_step_two<
            Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        full_sm_accumulate_step_two));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_accumulate_step_two<
            Torus, params, FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }

  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
      gpu_index, input_lwe_ciphertext_count, polynomial_size);
  *buffer = new pbs_buffer<Torus, MULTI_BIT>(
@@ -200,49 +200,6 @@ __host__ void scratch_programmable_bootstrap_tbc(
    uint32_t polynomial_size, uint32_t level_count,
    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {

  bool supports_dsm =
      supports_distributed_shared_memory_on_classic_programmable_bootstrap<
          Torus>(polynomial_size);

  uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
      polynomial_size);
  uint64_t partial_sm =
      get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
          polynomial_size);
  uint64_t minimum_sm_tbc = 0;
  if (supports_dsm)
    minimum_sm_tbc =
        get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
            polynomial_size);
  int max_shared_memory = cuda_get_max_shared_memory(0);

  if (max_shared_memory >= full_sm + minimum_sm_tbc) {
    check_cuda_error(cudaFuncSetAttribute(
        device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm + minimum_sm_tbc));
    cudaFuncSetCacheConfig(
        device_programmable_bootstrap_tbc<Torus, params, FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else if (max_shared_memory >= partial_sm + minimum_sm_tbc) {
    check_cuda_error(cudaFuncSetAttribute(
        device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        partial_sm + minimum_sm_tbc));
    cudaFuncSetCacheConfig(
        device_programmable_bootstrap_tbc<Torus, params, PARTIALSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else {
    check_cuda_error(cudaFuncSetAttribute(
        device_programmable_bootstrap_tbc<Torus, params, NOSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, minimum_sm_tbc));
    cudaFuncSetCacheConfig(
        device_programmable_bootstrap_tbc<Torus, params, NOSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }

  *buffer = new pbs_buffer<Torus, CLASSICAL>(
      stream, gpu_index, glwe_dimension, polynomial_size, level_count,
      input_lwe_ciphertext_count, PBS_VARIANT::TBC, allocate_gpu_memory);
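Editor's note: in the TBC scratch paths the thresholds are shifted by minimum_sm_tbc, the extra footprint reserved when distributed shared memory is supported; even the NOSM branch still requests that minimum. A hedged restatement of the selection above, with illustrative names:

#include <cstdint>

enum class TbcSm { NOSM, PARTIALSM, FULLSM };

// Same ordering as above: full + minimum first, then partial + minimum,
// otherwise only the distributed-shared-memory minimum is requested.
TbcSm select_tbc_sm(uint64_t max_shared_memory, uint64_t partial_sm,
                    uint64_t full_sm, uint64_t minimum_sm_tbc) {
  if (max_shared_memory >= full_sm + minimum_sm_tbc)
    return TbcSm::FULLSM;
  if (max_shared_memory >= partial_sm + minimum_sm_tbc)
    return TbcSm::PARTIALSM;
  return TbcSm::NOSM;
}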
@@ -204,84 +204,6 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
    uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
    bool allocate_gpu_memory) {

  bool supports_dsm =
      supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
          Torus>(polynomial_size);

  uint64_t full_sm_keybundle =
      get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
          polynomial_size);
  uint64_t full_sm_tbc_accumulate =
      get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
          polynomial_size);
  uint64_t partial_sm_tbc_accumulate =
      get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap<Torus>(
          polynomial_size);
  uint64_t minimum_sm_tbc_accumulate = 0;
  if (supports_dsm)
    minimum_sm_tbc_accumulate =
        get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
            polynomial_size);

  int max_shared_memory = cuda_get_max_shared_memory(0);

  if (max_shared_memory < full_sm_keybundle) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
                                                          FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_keybundle<Torus, params,
                                                          FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }

  if (max_shared_memory <
      partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
                                                               NOSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        minimum_sm_tbc_accumulate));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
                                                               NOSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else if (max_shared_memory <
             full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
                                                               PARTIALSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
                                                               PARTIALSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  } else {
    check_cuda_error(cudaFuncSetAttribute(
        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
                                                               FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize,
        full_sm_tbc_accumulate + minimum_sm_tbc_accumulate));
    cudaFuncSetCacheConfig(
        device_multi_bit_programmable_bootstrap_tbc_accumulate<Torus, params,
                                                               FULLSM>,
        cudaFuncCachePreferShared);
    check_cuda_error(cudaGetLastError());
  }

  auto lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
      gpu_index, input_lwe_ciphertext_count, polynomial_size);
  *buffer = new pbs_buffer<uint64_t, MULTI_BIT>(