feat(cuda): Add support for the classical PBS for polynomial_size=256.

Pedro Alves authored on 2023-02-13 19:58:46 -03:00
Committed by Agnès Leroy
parent 730274f156
commit bfb07b961d
3 changed files with 75 additions and 20 deletions
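Note on the pattern being extended: every PBS entry point validates the runtime polynomial_size, then switches on it to pick a compile-time Degree<N> template instantiation; this commit adds the 256 branch everywhere that switch appears. The following is a minimal, self-contained sketch of that dispatch idiom, with simplified stand-ins (Degree, run_pbs, dispatch_pbs are illustrative names, not the actual concrete-cuda API).

#include <cassert>
#include <cstdint>
#include <cstdio>

// Illustrative stand-in for the Degree<N> tag used in the diff: it carries the
// polynomial size as a compile-time constant, so the kernel can size shared
// memory and unroll its loops statically.
template <uint32_t N> struct Degree {
  static constexpr uint32_t value = N;
};

// Hypothetical templated worker, analogous to host_bootstrap_amortized<T, Degree<N>>.
template <typename Torus, typename params>
void run_pbs(uint32_t polynomial_size) {
  printf("PBS instantiated for N = %u (runtime N = %u)\n", params::value,
         polynomial_size);
}

// The commit adds `case 256` to switches of this shape, mapping the runtime
// polynomial_size onto a new compile-time instantiation.
void dispatch_pbs(uint32_t polynomial_size) {
  assert(("polynomial size should be one of 256, 512, 1024, 2048, 4096, 8192",
          polynomial_size == 256 || polynomial_size == 512 ||
              polynomial_size == 1024 || polynomial_size == 2048 ||
              polynomial_size == 4096 || polynomial_size == 8192));
  switch (polynomial_size) {
  case 256:
    run_pbs<uint64_t, Degree<256>>(polynomial_size); // newly supported size
    break;
  case 512:
    run_pbs<uint64_t, Degree<512>>(polynomial_size);
    break;
  default:
    break; // 1024/2048/4096/8192 elided for brevity
  }
}

int main() { dispatch_pbs(256); }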


@@ -13,13 +13,22 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
   assert(
       ("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32));
-  assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
-          "1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-          polynomial_size == 2048 || polynomial_size == 4096 ||
-          polynomial_size == 8192));
+  assert(
+      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
+       "1024, 2048, 4096, 8192",
+       polynomial_size == 256 || polynomial_size == 512 ||
+       polynomial_size == 1024 || polynomial_size == 2048 ||
+       polynomial_size == 4096 || polynomial_size == 8192));
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_amortized<uint32_t, Degree<256>>(
+        v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        lwe_idx, max_shared_memory);
+    break;
   case 512:
     host_bootstrap_amortized<uint32_t, Degree<512>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
@@ -139,13 +148,22 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
   assert(
       ("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64));
-  assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
-          "1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-          polynomial_size == 2048 || polynomial_size == 4096 ||
-          polynomial_size == 8192));
+  assert(
+      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
+       "1024, 2048, 4096, 8192",
+       polynomial_size == 256 || polynomial_size == 512 ||
+       polynomial_size == 1024 || polynomial_size == 2048 ||
+       polynomial_size == 4096 || polynomial_size == 8192));
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_amortized<uint64_t, Degree<256>>(
+        v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        lwe_idx, max_shared_memory);
+    break;
   case 512:
     host_bootstrap_amortized<uint64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
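A note on the assertion style used throughout these entry points: the string literal and the boolean condition are joined with the comma operator inside an extra pair of parentheses, so the message does not affect the result of the check, but it appears in the text assert() prints when the check fails. A minimal, standalone sketch of the idiom:

#include <cassert>

int main() {
  int polynomial_size = 300; // deliberately unsupported, so the assert fires
  // The comma operator evaluates and discards the string literal, then yields
  // the condition; on failure, assert() aborts and its diagnostic contains the
  // stringified expression, message included.
  assert(("polynomial size should be one of 256, 512, 1024, 2048, 4096, 8192",
          polynomial_size == 256 || polynomial_size == 512));
  return 0;
}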


@@ -16,11 +16,12 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
assert(("Error (GPU low latency PBS): base log should be <= 32",
base_log <= 32));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
assert((
"Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than 4 * SM/((k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
@@ -33,6 +34,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
           num_samples <= number_of_sm * 4. / 2. / level_count));
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_low_latency<uint32_t, Degree<256>>(
+        v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
+    break;
   case 512:
     host_bootstrap_low_latency<uint32_t, Degree<512>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
@@ -163,11 +172,12 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
assert(("Error (GPU low latency PBS): base log should be <= 64",
base_log <= 64));
assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
assert((
"Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than 4 * SM/((k + 1) * l) (the
// factor 4 being related to the occupancy of 50%). The only supported
// value for k is 1, so k + 1 = 2 for now.
@@ -180,6 +190,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
           num_samples <= number_of_sm * 4. / 2. / level_count));
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_low_latency<uint64_t, Degree<256>>(
+        v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
+    break;
   case 512:
     host_bootstrap_low_latency<uint64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
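The occupancy comment repeated above the low-latency switches translates into a concrete cap on num_samples: at most number_of_sm * 4 / ((k + 1) * level_count) with k = 1. A small standalone sketch of that check, using an assumed value of 108 SMs for illustration (the helper name below is a stand-in, not the library's):

#include <cassert>
#include <cstdint>

// Illustrative version of the low-latency PBS batch-size constraint: with
// k = 1 (so k + 1 = 2) and the factor 4 tied to 50% occupancy, at most
// number_of_sm * 4 / 2 / level_count samples fit in one call.
bool low_latency_batch_fits(int number_of_sm, uint32_t level_count,
                            uint32_t num_samples) {
  return num_samples <= number_of_sm * 4. / 2. / level_count;
}

int main() {
  // Example: 108 SMs (an A100-class GPU) and level_count = 2 give a limit of
  // 108 * 4 / 2 / 2 = 108 samples per call.
  assert(low_latency_batch_fits(108, 2, 108));
  assert(!low_latency_batch_fits(108, 2, 109));
}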


@@ -84,6 +84,25 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
   double2 *buffer;
   switch (polynomial_size) {
+  case 256:
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
+      batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, FULLSM>
+          <<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
+                                                                 buffer);
+    } else {
+      buffer = (double2 *)cuda_malloc_async(
+          shared_memory_size * total_polynomials, stream, gpu_index);
+      batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, NOSM>
+          <<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
+    }
+    break;
   case 512:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
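For the bootstrapping-key conversion, the new case 256 follows the same split as the larger degrees: if the FFT's working set fits in the device's opt-in shared memory limit, the FULLSM kernel is launched with dynamic shared memory (after raising cudaFuncAttributeMaxDynamicSharedMemorySize); otherwise the NOSM variant runs against a global-memory scratch buffer. A standalone sketch of the device query that a helper such as cuda_get_max_shared_memory presumably wraps; the query itself is plain CUDA runtime API, while the 4 KB working-set figure below is only an illustrative assumption, not the value the library computes.

#include <cstdio>
#include <cuda_runtime.h>

// Query the opt-in per-block dynamic shared memory limit. Kernels may only use
// more than the default 48 KB after cudaFuncSetAttribute raises
// cudaFuncAttributeMaxDynamicSharedMemorySize, as the diff above does.
int max_dynamic_shared_memory(int gpu_index) {
  int max_shared = 0;
  cudaDeviceGetAttribute(&max_shared, cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         gpu_index);
  return max_shared;
}

int main() {
  // Assumed size for illustration only: a working set on the order of
  // 256 double2 values (~4 KB), which would comfortably take the
  // full-shared-memory path on any recent GPU.
  size_t shared_memory_size = 256 * sizeof(double2);
  printf("assumed working set: %zu bytes, device limit: %d bytes\n",
         shared_memory_size, max_dynamic_shared_memory(0));
}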