fix(cuda): fix N = 8192 support

This commit is contained in:
Agnès Leroy
2022-11-22 14:18:45 +01:00
committed by Agnès Leroy
parent 68866766a4
commit 921c0a6306
4 changed files with 71 additions and 19 deletions

View File

@@ -2,6 +2,7 @@
#define CNCRT_BSK_H
#include "bootstrap.h"
#include "device.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include <atomic>
@@ -112,32 +113,79 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
cudaMemcpy(d_bsk, h_bsk, buffer_size, cudaMemcpyHostToDevice);
auto stream = static_cast<cudaStream_t *>(v_stream);
double2 *buffer;
switch (polynomial_size) {
case 512:
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 1024:
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 2048:
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 4096:
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 8192:
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, *stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
default:
break;
}
cudaFree(d_bsk);
cuda_drop_async(d_bsk, *stream, gpu_index);
cuda_drop_async(buffer, *stream, gpu_index);
free(h_bsk);
}

View File

@@ -718,10 +718,14 @@ __device__ void correction_inverse_fft_inplace(double2 *x) {
* this function must be called with actual degree
* function takes as input already compressed input
*/
template <class params>
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output) {
extern __shared__ double2 sharedMemoryFFT[];
double2 *fft = sharedMemoryFFT;
template <class params, sharedMemDegree SMD>
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
double2 *buffer) {
double2 *fft = &buffer[blockIdx.x * params::degree / 2];
if constexpr (SMD != NOSM) {
extern __shared__ double2 sharedMemoryFFT[];
fft = sharedMemoryFFT;
}
int tid = threadIdx.x;

View File

@@ -1,7 +1,7 @@
#include "cuComplex.h"
__constant__ short SW1[2048];
__constant__ short SW2[2048];
__constant__ short SW1[4096];
__constant__ short SW2[4096];
__constant__ double2 negTwids3[4] = {
{0.923879532511286738483136105060, 0.382683432365089781779232680492},

View File

@@ -2,8 +2,8 @@
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
#define GPU_BOOTSTRAP_TWIDDLES_CUH
extern __constant__ short SW1[2048];
extern __constant__ short SW2[2048];
extern __constant__ short SW1[4096];
extern __constant__ short SW2[4096];
extern __constant__ double2 negTwids3[4];
extern __constant__ double2 negTwids4[8];