fix(cuda): fix N = 8192 support

This commit is contained in:
Agnès Leroy
2022-11-22 14:18:45 +01:00
committed by Agnès Leroy
parent 68866766a4
commit 921c0a6306
4 changed files with 71 additions and 19 deletions

View File

@@ -2,6 +2,7 @@
#define CNCRT_BSK_H
#include "bootstrap.h"
#include "device.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include <atomic>
@@ -112,32 +113,79 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
cudaMemcpy(d_bsk, h_bsk, buffer_size, cudaMemcpyHostToDevice);
auto stream = static_cast<cudaStream_t *>(v_stream);
double2 *buffer;
switch (polynomial_size) {
case 512:
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 1024:
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 2048:
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 4096:
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 8192:
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
default:
break;
}
cudaFree(d_bsk);
cuda_drop_async(d_bsk, *stream, gpu_index);
cuda_drop_async(buffer, *stream, gpu_index);
free(h_bsk);
}

View File

@@ -718,10 +718,14 @@ __device__ void correction_inverse_fft_inplace(double2 *x) {
* this function must be called with actual degree
* function takes as input already compressed input
*/
template <class params>
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output) {
extern __shared__ double2 sharedMemoryFFT[];
double2 *fft = sharedMemoryFFT;
template <class params, sharedMemDegree SMD>
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
double2 *buffer) {
double2 *fft = &buffer[blockIdx.x * params::degree / 2];
if constexpr (SMD != NOSM) {
extern __shared__ double2 sharedMemoryFFT[];
fft = sharedMemoryFFT;
}
int tid = threadIdx.x;

View File

@@ -1,7 +1,7 @@
#include "cuComplex.h"
__constant__ short SW1[2048];
__constant__ short SW2[2048];
__constant__ short SW1[4096];
__constant__ short SW2[4096];
__constant__ double2 negTwids3[4] = {
{0.923879532511286738483136105060, 0.382683432365089781779232680492},

View File

@@ -2,8 +2,8 @@
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
#define GPU_BOOTSTRAP_TWIDDLES_CUH
extern __constant__ short SW1[2048];
extern __constant__ short SW2[2048];
extern __constant__ short SW1[4096];
extern __constant__ short SW2[4096];
extern __constant__ double2 negTwids3[4];
extern __constant__ double2 negTwids4[8];