mirror of
https://github.com/zama-ai/concrete.git
synced 2026-04-17 03:00:54 -04:00
fix(cuda): fix N = 8192 support
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
#define CNCRT_BSK_H
|
||||
|
||||
#include "bootstrap.h"
|
||||
#include "device.h"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial.cuh"
|
||||
#include <atomic>
|
||||
@@ -112,32 +113,79 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
|
||||
cudaMemcpy(d_bsk, h_bsk, buffer_size, cudaMemcpyHostToDevice);
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
double2 *buffer;
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
|
||||
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>, FULLSM>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
|
||||
buffer);
|
||||
} else {
|
||||
buffer = (double2 *)cuda_malloc_async(
|
||||
shared_memory_size * total_polynomials, *stream, gpu_index);
|
||||
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>, NOSM>
|
||||
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
|
||||
}
|
||||
break;
|
||||
case 1024:
|
||||
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
|
||||
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>, FULLSM>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
|
||||
buffer);
|
||||
} else {
|
||||
buffer = (double2 *)cuda_malloc_async(
|
||||
shared_memory_size * total_polynomials, *stream, gpu_index);
|
||||
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>, NOSM>
|
||||
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
|
||||
}
|
||||
break;
|
||||
case 2048:
|
||||
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
|
||||
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>, FULLSM>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
|
||||
buffer);
|
||||
} else {
|
||||
buffer = (double2 *)cuda_malloc_async(
|
||||
shared_memory_size * total_polynomials, *stream, gpu_index);
|
||||
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>, NOSM>
|
||||
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
|
||||
}
|
||||
break;
|
||||
case 4096:
|
||||
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
|
||||
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>, FULLSM>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
|
||||
buffer);
|
||||
} else {
|
||||
buffer = (double2 *)cuda_malloc_async(
|
||||
shared_memory_size * total_polynomials, *stream, gpu_index);
|
||||
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>, NOSM>
|
||||
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
|
||||
}
|
||||
break;
|
||||
case 8192:
|
||||
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
|
||||
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
|
||||
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>, FULLSM>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
|
||||
buffer);
|
||||
} else {
|
||||
buffer = (double2 *)cuda_malloc_async(
|
||||
shared_memory_size * total_polynomials, *stream, gpu_index);
|
||||
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>, NOSM>
|
||||
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
cudaFree(d_bsk);
|
||||
cuda_drop_async(d_bsk, *stream, gpu_index);
|
||||
cuda_drop_async(buffer, *stream, gpu_index);
|
||||
free(h_bsk);
|
||||
}
|
||||
|
||||
|
||||
@@ -718,10 +718,14 @@ __device__ void correction_inverse_fft_inplace(double2 *x) {
|
||||
* this function must be called with actual degree
|
||||
* function takes as input already compressed input
|
||||
*/
|
||||
template <class params>
|
||||
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output) {
|
||||
extern __shared__ double2 sharedMemoryFFT[];
|
||||
double2 *fft = sharedMemoryFFT;
|
||||
template <class params, sharedMemDegree SMD>
|
||||
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
|
||||
double2 *buffer) {
|
||||
double2 *fft = &buffer[blockIdx.x * params::degree / 2];
|
||||
if constexpr (SMD != NOSM) {
|
||||
extern __shared__ double2 sharedMemoryFFT[];
|
||||
fft = sharedMemoryFFT;
|
||||
}
|
||||
|
||||
int tid = threadIdx.x;
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
#include "cuComplex.h"
|
||||
|
||||
__constant__ short SW1[2048];
|
||||
__constant__ short SW2[2048];
|
||||
__constant__ short SW1[4096];
|
||||
__constant__ short SW2[4096];
|
||||
|
||||
__constant__ double2 negTwids3[4] = {
|
||||
{0.923879532511286738483136105060, 0.382683432365089781779232680492},
|
||||
|
||||
@@ -2,8 +2,8 @@
|
||||
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
|
||||
#define GPU_BOOTSTRAP_TWIDDLES_CUH
|
||||
|
||||
extern __constant__ short SW1[2048];
|
||||
extern __constant__ short SW2[2048];
|
||||
extern __constant__ short SW1[4096];
|
||||
extern __constant__ short SW2[4096];
|
||||
|
||||
extern __constant__ double2 negTwids3[4];
|
||||
extern __constant__ double2 negTwids4[8];
|
||||
|
||||
Reference in New Issue
Block a user