From bfb07b961dc861e4659417e052a29b458fab15c5 Mon Sep 17 00:00:00 2001
From: Pedro Alves <pedro.alves@zama.ai>
Date: Mon, 13 Feb 2023 19:58:46 -0300
Subject: [PATCH] feat(cuda): Add support for the classical PBS for
 polynomial_size=256.

---
 src/bootstrap_amortized.cu       | 38 +++++++++++++++++++++++---------
 src/bootstrap_low_latency.cu     | 38 +++++++++++++++++++++++---------
 src/crypto/bootstrapping_key.cuh | 19 ++++++++++++++++
 3 files changed, 75 insertions(+), 20 deletions(-)

diff --git a/src/bootstrap_amortized.cu b/src/bootstrap_amortized.cu
index 61e94ca94..79da3e8f9 100644
--- a/src/bootstrap_amortized.cu
+++ b/src/bootstrap_amortized.cu
@@ -13,13 +13,22 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
 
   assert(
       ("Error (GPU amortized PBS): base log should be <= 32", base_log <= 32));
-  assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
-          "1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-              polynomial_size == 2048 || polynomial_size == 4096 ||
-              polynomial_size == 8192));
+  assert(
+      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
+       "1024, 2048, 4096, 8192",
+       polynomial_size == 256 || polynomial_size == 512 ||
+           polynomial_size == 1024 || polynomial_size == 2048 ||
+           polynomial_size == 4096 || polynomial_size == 8192));
 
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_amortized<uint32_t, Degree<256>>(
+        v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        lwe_idx, max_shared_memory);
+    break;
   case 512:
     host_bootstrap_amortized<uint32_t, Degree<512>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
@@ -139,13 +148,22 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
 
   assert(
       ("Error (GPU amortized PBS): base log should be <= 64", base_log <= 64));
-  assert(("Error (GPU amortized PBS): polynomial size should be one of 512, "
-          "1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-              polynomial_size == 2048 || polynomial_size == 4096 ||
-              polynomial_size == 8192));
+  assert(
+      ("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
+       "1024, 2048, 4096, 8192",
+       polynomial_size == 256 || polynomial_size == 512 ||
+           polynomial_size == 1024 || polynomial_size == 2048 ||
+           polynomial_size == 4096 || polynomial_size == 8192));
 
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_amortized<uint64_t, Degree<256>>(
+        v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        lwe_idx, max_shared_memory);
+    break;
   case 512:
     host_bootstrap_amortized<uint64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
diff --git a/src/bootstrap_low_latency.cu b/src/bootstrap_low_latency.cu
index 662f29779..e9e16bf8e 100644
--- a/src/bootstrap_low_latency.cu
+++ b/src/bootstrap_low_latency.cu
@@ -16,11 +16,12 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
 
   assert(("Error (GPU low latency PBS): base log should be <= 32",
           base_log <= 32));
-  assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
-          "1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-              polynomial_size == 2048 || polynomial_size == 4096 ||
-              polynomial_size == 8192));
+  assert((
+      "Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
+      "1024, 2048, 4096, 8192",
+      polynomial_size == 256 || polynomial_size == 512 ||
+          polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
   // The number of samples should be lower than 4 * SM/((k + 1) * l) (the
   // factor 4 being related to the occupancy of 50%). The only supported
   // value for k is 1, so k + 1 = 2 for now.
@@ -33,6 +34,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
           num_samples <= number_of_sm * 4. / 2. / level_count));
 
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_low_latency<uint32_t, Degree<256>>(
+        v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
+        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
+    break;
   case 512:
     host_bootstrap_low_latency<uint32_t, Degree<512>>(
         v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
@@ -163,11 +172,12 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
 
   assert(("Error (GPU low latency PBS): base log should be <= 64",
           base_log <= 64));
-  assert(("Error (GPU low latency PBS): polynomial size should be one of 512, "
-          "1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-              polynomial_size == 2048 || polynomial_size == 4096 ||
-              polynomial_size == 8192));
+  assert((
+      "Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
+      "1024, 2048, 4096, 8192",
+      polynomial_size == 256 || polynomial_size == 512 ||
+          polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
   // The number of samples should be lower than 4 * SM/((k + 1) * l) (the
   // factor 4 being related to the occupancy of 50%). The only supported
   // value for k is 1, so k + 1 = 2 for now.
@@ -180,6 +190,14 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
           num_samples <= number_of_sm * 4. / 2. / level_count));
 
   switch (polynomial_size) {
+  case 256:
+    host_bootstrap_low_latency<uint64_t, Degree<256>>(
+        v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
+        (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
+        (double2 *)bootstrapping_key, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
+        max_shared_memory);
+    break;
   case 512:
     host_bootstrap_low_latency<uint64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
diff --git a/src/crypto/bootstrapping_key.cuh b/src/crypto/bootstrapping_key.cuh
index 4ed497e4b..51d5f247a 100644
--- a/src/crypto/bootstrapping_key.cuh
+++ b/src/crypto/bootstrapping_key.cuh
@@ -84,6 +84,25 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
 
   double2 *buffer;
   switch (polynomial_size) {
+  case 256:
+    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+      check_cuda_error(cudaFuncSetAttribute(
+          batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, FULLSM>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
+      check_cuda_error(cudaFuncSetCacheConfig(
+          batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, FULLSM>,
+          cudaFuncCachePreferShared));
+      batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, FULLSM>
+          <<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
+                                                                 buffer);
+    } else {
+      buffer = (double2 *)cuda_malloc_async(
+          shared_memory_size * total_polynomials, stream, gpu_index);
+      batch_NSMFFT<FFTDegree<Degree<256>, ForwardFFT>, NOSM>
+          <<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
+    }
+    break;
   case 512:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);