feat(concrete-cuda): Added classical pbs support for 2^14 polynomial size, added corresponding tests.

Author: Beka Barbakadze
Date: 2023-03-20 14:48:13 +04:00
Committed by: Agnès Leroy
Parent: 82ab395f71
Commit: b5839a3993

8 changed files with 4199 additions and 13 deletions


@@ -6,10 +6,11 @@
void checks_fast_bootstrap_amortized(int polynomial_size) {
assert(
("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
"1024, 2048, 4096, 8192, 16384",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
polynomial_size == 4096 || polynomial_size == 8192 ||
polynomial_size == 16384));
}
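For context, the whitelist above is exactly the powers of two from 2^8 up to the newly added 2^14. A minimal equivalent check, as an illustrative sketch only (not part of this commit; the helper name is hypothetical):

#include <cassert>

void check_supported_polynomial_size(int polynomial_size) {
  // Accept exactly the powers of two between 2^8 and 2^14.
  bool is_power_of_two =
      polynomial_size > 0 && (polynomial_size & (polynomial_size - 1)) == 0;
  assert(is_power_of_two && polynomial_size >= 256 &&
         polynomial_size <= 16384);
}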
/*
@@ -67,6 +68,11 @@ void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_bootstrap_amortized<uint32_t, int32_t, Degree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
@@ -118,6 +124,11 @@ void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_bootstrap_amortized<uint64_t, int64_t, Degree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
@@ -185,6 +196,14 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 16384:
host_bootstrap_amortized<uint32_t, Degree<16384>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
default:
break;
}
@@ -314,6 +333,14 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 16384:
host_bootstrap_amortized<uint64_t, Degree<16384>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
default:
break;
}
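All four new 16384 cases above follow the same idiom: the runtime polynomial size is mapped onto the compile-time Degree<N> template parameter, so each supported size gets its own fully specialized kernel instantiation. A minimal sketch of that dispatch pattern (illustrative only; run_for_degree and dispatch_on_polynomial_size are hypothetical names):

template <int N> void run_for_degree() {
  // launch the kernel instantiation specialized for polynomial size N
}

void dispatch_on_polynomial_size(int polynomial_size) {
  switch (polynomial_size) {
  case 8192:
    run_for_degree<8192>();
    break;
  case 16384: // size added by this commit
    run_for_degree<16384>();
    break;
  default:
    break; // other sizes are rejected earlier by the checks_* asserts
  }
}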


@@ -8,10 +8,11 @@ void checks_fast_bootstrap_low_latency(int glwe_dimension, int level_count,
assert((
"Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
"1024, 2048, 4096, 8192, 16384",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
polynomial_size == 4096 || polynomial_size == 8192 ||
polynomial_size == 16384));
// The number of samples should be lower than 4 * SM/((k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
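As a worked sketch of the bound stated in the comment above (the helper is hypothetical, not part of the commit): with k = 1, level_count = 2 and a GPU with 108 SMs, the cap is 4 * 108 / (2 * 2) = 108 samples.

int max_low_latency_samples(int number_of_sm, int glwe_dimension /* k */,
                            int level_count) {
  // Upper bound on the batch size: 4 * SM / ((k + 1) * l),
  // the factor 4 coming from the targeted 50% occupancy.
  return 4 * number_of_sm / ((glwe_dimension + 1) * level_count);
}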
@@ -89,6 +90,12 @@ void scratch_cuda_bootstrap_low_latency_32(
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 16384:
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
default:
break;
}
@@ -145,6 +152,12 @@ void scratch_cuda_bootstrap_low_latency_64(
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 16384:
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
default:
break;
}
@@ -217,6 +230,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 16384:
host_bootstrap_low_latency<uint32_t, Degree<16384>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
default:
break;
}
@@ -358,6 +379,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
case 16384:
host_bootstrap_low_latency<uint64_t, Degree<16384>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
max_shared_memory);
break;
default:
break;
}


@@ -198,6 +198,25 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<Degree<16384>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<Degree<16384>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<Degree<16384>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<Degree<16384>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
default:
break;
}
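The 16384 branch above uses the same full-shared-memory/no-shared-memory split as the smaller sizes: when the FFT working set exceeds the default dynamic shared-memory limit, the kernel either opts into a larger limit or falls back to a device-memory buffer (NOSM). A minimal sketch of the opt-in step, with a hypothetical kernel name:

__global__ void my_fft_kernel(double2 *in, double2 *out) {
  // FFT body omitted in this sketch
}

void launch_with_full_shared_memory(double2 *in, double2 *out,
                                    int shared_memory_size, cudaStream_t stream,
                                    dim3 grid, dim3 block) {
  // Kernels needing more than the default 48 KB of dynamic shared memory
  // per block must raise the limit explicitly before launch.
  cudaFuncSetAttribute(my_fft_kernel,
                       cudaFuncAttributeMaxDynamicSharedMemorySize,
                       shared_memory_size);
  cudaFuncSetCacheConfig(my_fft_kernel, cudaFuncCachePreferShared);
  my_fft_kernel<<<grid, block, shared_memory_size, stream>>>(in, out);
}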


@@ -295,8 +295,8 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
}
// compressed size = 8192 is actual polynomial size = 16384.
// this size is not supported yet by any of the concrete-cuda api.
// may be used in the future.
// from this size on, the twiddles can't fit in constant memory,
// so from here the butterfly operation accesses device memory.
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
@@ -306,7 +306,7 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
i1 = 2 * (params::degree / 8192) * twid_id +
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles[twid_id + 4096];
w = negtwiddles13[twid_id];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
@@ -353,8 +353,9 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
// butterfly operation is started from last level
// compressed size = 8192 is actual polynomial size = 16384.
// this size is not supported yet by any of the concrete-cuda api.
// may be used in the future.
// twiddles for this size can't fit in constant memory, so the
// butterfly operation for this level accesses device memory to fetch
// the twiddles.
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
@@ -364,7 +365,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
i1 = 2 * (params::degree / 8192) * twid_id +
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles[twid_id + 4096];
w = negtwiddles13[twid_id];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;

File diff suppressed because it is too large.


@@ -2,6 +2,13 @@
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
#define GPU_BOOTSTRAP_TWIDDLES_CUH
extern __constant__ double2 negtwiddles[4096];
/*
* 'negtwiddles' is stored in constant memory for faster access times.
* Because of its limited size, only twiddles for polynomial sizes up to 2^12
* can be stored there; twiddles for 2^13 are stored in device memory in
* 'negtwiddles13'.
*/
extern __constant__ double2 negtwiddles[4096];
extern __device__ double2 negtwiddles13[4096];
#endif
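The split is forced by the size of the constant-memory bank: a double2 is 16 bytes, so negtwiddles[4096] already occupies the full 64 KB of __constant__ memory available on NVIDIA GPUs, and the extra 4096 twiddles for the next level have to live in regular device memory. A quick sanity-check sketch (not part of the header):

static_assert(4096 * sizeof(double2) == 64 * 1024,
              "the constant-memory twiddle table already fills the 64 KB bank");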


@@ -10,8 +10,10 @@ constexpr int choose_opt(int degree) {
return 8;
else if (degree == 4096)
return 16;
else
else if (degree == 8192)
return 32;
else
return 64;
}
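Assuming 'opt' is the number of coefficients each thread handles (an interpretation, not stated in this hunk), the new branches keep the per-block work split constant across sizes: 8192 / 32 = 256 and 16384 / 64 = 256, matching 4096 / 16 and 2048 / 8. A compile-time check as a sketch (also assuming the branches not shown in the hunk only match smaller degrees):

static_assert(8192 / choose_opt(8192) == 256,
              "degree / opt stays at 256 for 2^13");
static_assert(16384 / choose_opt(16384) == 256,
              "degree / opt stays at 256 for 2^14");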
template <class params> class HalfDegree {
public:


@@ -320,7 +320,10 @@ TEST_P(BootstrapTestPrimitives_u64, low_latency_bootstrap) {
2, 1, 50},
(BootstrapTestParams){881, 1, 8192, 0.000007069849454709433,
0.00000000000000029403601535432533, 22, 1, 2, 1,
2, 1, 25});
2, 1, 25},
(BootstrapTestParams){976, 1, 16384, 0.000007069849454709433,
0.00000000000000029403601535432533, 11, 3, 4, 1,
2, 1, 10});
std::string printParamName(::testing::TestParamInfo<BootstrapTestParams> p) {
BootstrapTestParams params = p.param;