From 184d45338758d77406ccf709a765f7945d5eaf91 Mon Sep 17 00:00:00 2001
From: Pedro Alves <pedro.alves@zama.ai>
Date: Wed, 22 Feb 2023 11:13:38 -0300
Subject: [PATCH] refactor(cuda): Implement support for N=256 in the cmux tree,
 bit extraction, and cbs.

---
 src/bit_extraction.cu    | 48 +++++++++++++++++++++------
 src/circuit_bootstrap.cu | 46 +++++++++++++++++++++-----
 src/vertical_packing.cu  | 60 ++++++++++++++++++++++++++++------
 src/wop_bootstrap.cu     | 70 ++++++++++++++++++++++++++++++++++------
 4 files changed, 187 insertions(+), 37 deletions(-)

diff --git a/src/bit_extraction.cu b/src/bit_extraction.cu
index ac7a8f427..a53b4841e 100644
--- a/src/bit_extraction.cu
+++ b/src/bit_extraction.cu
@@ -12,6 +12,12 @@ void scratch_cuda_extract_bits_32(
     bool allocate_gpu_memory) {
 
   switch (polynomial_size) {
+  case 256:
+    scratch_extract_bits<uint32_t, int32_t, Degree<256>>(
+        v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, level_count, number_of_inputs, max_shared_memory,
+        allocate_gpu_memory);
+    break;
   case 512:
     scratch_extract_bits<uint32_t, int32_t, Degree<512>>(
         v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
@@ -59,6 +65,12 @@ void scratch_cuda_extract_bits_64(
     bool allocate_gpu_memory) {
 
   switch (polynomial_size) {
+  case 256:
+    scratch_extract_bits<uint64_t, int64_t, Degree<256>>(
+        v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, level_count, number_of_inputs, max_shared_memory,
+        allocate_gpu_memory);
+    break;
   case 512:
     scratch_extract_bits<uint64_t, int64_t, Degree<512>>(
         v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
@@ -110,10 +122,10 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
   assert(("Error (GPU extract bits): base log should be <= 32",
           base_log_bsk <= 32));
   assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
-          "512, 1024, 2048, 4096, 8192",
-          lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
-              lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
-              lwe_dimension_in == 8192));
+          "256, 512, 1024, 2048, 4096, 8192",
+          lwe_dimension_in == 256 || lwe_dimension_in == 512 ||
+              lwe_dimension_in == 1024 || lwe_dimension_in == 2048 ||
+              lwe_dimension_in == 4096 || lwe_dimension_in == 8192));
   assert(("Error (GPU extract bits): lwe_dimension_in should be equal to "
           "polynomial_size",
           lwe_dimension_in == polynomial_size));
@@ -130,6 +142,15 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
           number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
 
   switch (lwe_dimension_in) {
+  case 256:
+    host_extract_bits<uint32_t, Degree<256>>(
+        v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
+        (uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
+        (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
+        lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
+        level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
+        max_shared_memory);
+    break;
   case 512:
     host_extract_bits<uint32_t, Degree<512>>(
         v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
@@ -210,7 +231,7 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
  *  - 'ksk' keyswitch key
  *  - 'fourier_bsk'  complex compressed bsk in fourier domain
  *  - 'lwe_dimension_in' input LWE ciphertext dimension, supported input
- * dimensions are: {512, 1024,2048, 4096, 8192}
+ * dimensions are: {256, 512, 1024, 2048, 4096, 8192}
  *  - 'lwe_dimension_out' output LWE ciphertext dimension
  *  - 'glwe_dimension' GLWE dimension,  only glwe_dimension = 1 is supported
  * for now
@@ -238,10 +259,10 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
   assert(("Error (GPU extract bits): base log should be <= 64",
           base_log_bsk <= 64));
   assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
-          "512, 1024, 2048, 4096, 8192",
-          lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
-              lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
-              lwe_dimension_in == 8192));
+          "256, 512, 1024, 2048, 4096, 8192",
+          lwe_dimension_in == 256 || lwe_dimension_in == 512 ||
+              lwe_dimension_in == 1024 || lwe_dimension_in == 2048 ||
+              lwe_dimension_in == 4096 || lwe_dimension_in == 8192));
   assert(("Error (GPU extract bits): lwe_dimension_in should be equal to "
           "polynomial_size",
           lwe_dimension_in == polynomial_size));
@@ -258,6 +279,15 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
           number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
 
   switch (lwe_dimension_in) {
+  case 256:
+    host_extract_bits<uint64_t, Degree<256>>(
+        v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
+        (uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
+        (double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
+        lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
+        level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
+        max_shared_memory);
+    break;
   case 512:
     host_extract_bits<uint64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
diff --git a/src/circuit_bootstrap.cu b/src/circuit_bootstrap.cu
index 51256b2cd..55236f84c 100644
--- a/src/circuit_bootstrap.cu
+++ b/src/circuit_bootstrap.cu
@@ -13,6 +13,12 @@ void scratch_cuda_circuit_bootstrap_32(
     uint32_t max_shared_memory, bool allocate_gpu_memory) {
 
   switch (polynomial_size) {
+  case 256:
+    scratch_circuit_bootstrap<uint32_t, int32_t, Degree<256>>(
+        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
+        allocate_gpu_memory);
+    break;
   case 512:
     scratch_circuit_bootstrap<uint32_t, int32_t, Degree<512>>(
         v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
@@ -60,6 +66,12 @@ void scratch_cuda_circuit_bootstrap_64(
     uint32_t max_shared_memory, bool allocate_gpu_memory) {
 
   switch (polynomial_size) {
+  case 256:
+    scratch_circuit_bootstrap<uint64_t, int64_t, Degree<256>>(
+        v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
+        polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
+        allocate_gpu_memory);
+    break;
   case 512:
     scratch_circuit_bootstrap<uint64_t, int64_t, Degree<512>>(
         v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
@@ -108,10 +120,10 @@ void cuda_circuit_bootstrap_32(
     uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
     uint32_t max_shared_memory) {
   assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
-          "512, 1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-              polynomial_size == 2048 || polynomial_size == 4096 ||
-              polynomial_size == 8192));
+          "256, 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 256 || polynomial_size == 512 ||
+              polynomial_size == 1024 || polynomial_size == 2048 ||
+              polynomial_size == 4096 || polynomial_size == 8192));
   // The number of samples should be lower than the number of streaming
   // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
   // to the occupancy of 50%). The only supported value for k is 1, so
@@ -124,6 +136,15 @@ void cuda_circuit_bootstrap_32(
           "level_count_bsk",
           number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
   switch (polynomial_size) {
+  case 256:
+    host_circuit_bootstrap<uint32_t, Degree<256>>(
+        v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
+        (double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
+        (uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
+        glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
+        base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
+        max_shared_memory);
+    break;
   case 512:
     host_circuit_bootstrap<uint32_t, Degree<512>>(
         v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
@@ -208,10 +229,10 @@ void cuda_circuit_bootstrap_64(
     uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
     uint32_t max_shared_memory) {
   assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
-          "512, 1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-              polynomial_size == 2048 || polynomial_size == 4096 ||
-              polynomial_size == 8192));
+          "256, 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 256 || polynomial_size == 512 ||
+              polynomial_size == 1024 || polynomial_size == 2048 ||
+              polynomial_size == 4096 || polynomial_size == 8192));
   // The number of samples should be lower than the number of streaming
   // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
   // to the occupancy of 50%). The only supported value for k is 1, so
@@ -225,6 +246,15 @@ void cuda_circuit_bootstrap_64(
           number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
   // The number of samples should be lower than the number of streaming
   switch (polynomial_size) {
+  case 256:
+    host_circuit_bootstrap<uint64_t, Degree<256>>(
+        v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
+        (double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
+        (uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
+        glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
+        base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
+        max_shared_memory);
+    break;
   case 512:
     host_circuit_bootstrap<uint64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
diff --git a/src/vertical_packing.cu b/src/vertical_packing.cu
index 945343373..eb6a437dd 100644
--- a/src/vertical_packing.cu
+++ b/src/vertical_packing.cu
@@ -16,6 +16,11 @@ void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
                                bool allocate_gpu_memory) {
 
   switch (polynomial_size) {
+  case 256:
+    scratch_cmux_tree<uint32_t, int32_t, Degree<256>>(
+        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
+        level_count, r, tau, max_shared_memory, allocate_gpu_memory);
+    break;
   case 512:
     scratch_cmux_tree<uint32_t, int32_t, Degree<512>>(
         v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
@@ -59,6 +64,11 @@ void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
                                uint32_t max_shared_memory,
                                bool allocate_gpu_memory) {
   switch (polynomial_size) {
+  case 256:
+    scratch_cmux_tree<uint64_t, int64_t, Degree<256>>(
+        v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
+        level_count, r, tau, max_shared_memory, allocate_gpu_memory);
+    break;
   case 512:
     scratch_cmux_tree<uint64_t, int64_t, Degree<512>>(
         v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
@@ -101,17 +111,24 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
                        uint32_t max_shared_memory) {
 
   assert(("Error (GPU Cmux tree): base log should be <= 32", base_log <= 32));
-  assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, "
-          "2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-              polynomial_size == 2048 || polynomial_size == 4096 ||
-              polynomial_size == 8192));
+  assert((
+      "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
+      "2048, 4096, 8192",
+      polynomial_size == 256 || polynomial_size == 512 ||
+          polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
   // For larger k we will need to adjust the mask size
   assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
           "be >= 1 ",
           r >= 1));
 
   switch (polynomial_size) {
+  case 256:
+    host_cmux_tree<uint32_t, int32_t, Degree<256>>(
+        v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
+        (uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
+        polynomial_size, base_log, level_count, r, tau, max_shared_memory);
+    break;
   case 512:
     host_cmux_tree<uint32_t, int32_t, Degree<512>>(
         v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
@@ -182,17 +199,24 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
                        uint32_t max_shared_memory) {
 
   assert(("Error (GPU Cmux tree): base log should be <= 64", base_log <= 64));
-  assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, "
-          "2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-              polynomial_size == 2048 || polynomial_size == 4096 ||
-              polynomial_size == 8192));
+  assert((
+      "Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
+      "2048, 4096, 8192",
+      polynomial_size == 256 || polynomial_size == 512 ||
+          polynomial_size == 1024 || polynomial_size == 2048 ||
+          polynomial_size == 4096 || polynomial_size == 8192));
   // For larger k we will need to adjust the mask size
   assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
           "be >= 1 ",
           r >= 1));
 
   switch (polynomial_size) {
+  case 256:
+    host_cmux_tree<uint64_t, int64_t, Degree<256>>(
+        v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
+        (uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
+        polynomial_size, base_log, level_count, r, tau, max_shared_memory);
+    break;
   case 512:
     host_cmux_tree<uint64_t, int64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
@@ -251,6 +275,11 @@ void scratch_cuda_blind_rotation_sample_extraction_32(
     bool allocate_gpu_memory) {
 
   switch (polynomial_size) {
+  case 256:
+    scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<256>>(
+        v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
+        level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
+    break;
   case 512:
     scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<512>>(
         v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
@@ -293,6 +322,11 @@ void scratch_cuda_blind_rotation_sample_extraction_64(
     bool allocate_gpu_memory) {
 
   switch (polynomial_size) {
+  case 256:
+    scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<256>>(
+        v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
+        level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
+    break;
   case 512:
     scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<512>>(
         v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
@@ -353,6 +387,12 @@ void cuda_blind_rotate_and_sample_extraction_64(
     uint32_t l_gadget, uint32_t max_shared_memory) {
 
   switch (polynomial_size) {
+  case 256:
+    host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<256>>(
+        v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
+        (uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
+        polynomial_size, base_log, l_gadget, max_shared_memory);
+    break;
   case 512:
     host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
diff --git a/src/wop_bootstrap.cu b/src/wop_bootstrap.cu
index ae870a522..069c767e6 100644
--- a/src/wop_bootstrap.cu
+++ b/src/wop_bootstrap.cu
@@ -14,6 +14,12 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_32(
     bool allocate_gpu_memory) {
 
   switch (polynomial_size) {
+  case 256:
+    scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<256>>(
+        v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
+        lwe_dimension, polynomial_size, level_count_cbs, number_of_inputs, tau,
+        max_shared_memory, allocate_gpu_memory);
+    break;
   case 512:
     scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<512>>(
         v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
@@ -63,6 +69,12 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_64(
     bool allocate_gpu_memory) {
 
   switch (polynomial_size) {
+  case 256:
+    scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
+        v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
+        lwe_dimension, polynomial_size, level_count_cbs, number_of_inputs, tau,
+        max_shared_memory, allocate_gpu_memory);
+    break;
   case 512:
     scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<512>>(
         v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
@@ -113,6 +125,14 @@ void scratch_cuda_wop_pbs_32(
     uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
     uint32_t max_shared_memory, bool allocate_gpu_memory) {
   switch (polynomial_size) {
+  case 256:
+    scratch_wop_pbs<uint32_t, int32_t, Degree<256>>(
+        v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
+        glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
+        level_count_bsk, number_of_bits_of_message_including_padding,
+        number_of_bits_to_extract, number_of_inputs, max_shared_memory,
+        allocate_gpu_memory);
+    break;
   case 512:
     scratch_wop_pbs<uint32_t, int32_t, Degree<512>>(
         v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
@@ -173,6 +193,14 @@ void scratch_cuda_wop_pbs_64(
     uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
     uint32_t max_shared_memory, bool allocate_gpu_memory) {
   switch (polynomial_size) {
+  case 256:
+    scratch_wop_pbs<uint64_t, int64_t, Degree<256>>(
+        v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
+        glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
+        level_count_bsk, number_of_bits_of_message_including_padding,
+        number_of_bits_to_extract, number_of_inputs, max_shared_memory,
+        allocate_gpu_memory);
+    break;
   case 512:
     scratch_wop_pbs<uint64_t, int64_t, Degree<512>>(
         v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
@@ -232,7 +260,7 @@ void scratch_cuda_wop_pbs_64(
  *  - 'lut_vector' list of test vectors
  *  - 'cbs_vp_buffer' a pre-allocated array to store intermediate results
  *  - 'polynomial_size' size of the test polynomial, supported sizes:
- * {512, 1024, 2048, 4096, 8192}
+ * {256, 512, 1024, 2048, 4096, 8192}
  *  - 'glwe_dimension' supported dimensions: {1}
  *  - 'lwe_dimension' dimension of input LWE ciphertexts
  *  - 'level_count_bsk' decomposition level for bootstrapping
@@ -255,10 +283,10 @@ void cuda_circuit_bootstrap_vertical_packing_64(
     uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
     uint32_t max_shared_memory) {
   assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
-          "512, 1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-              polynomial_size == 2048 || polynomial_size == 4096 ||
-              polynomial_size == 8192));
+          "256, 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 256 || polynomial_size == 512 ||
+              polynomial_size == 1024 || polynomial_size == 2048 ||
+              polynomial_size == 4096 || polynomial_size == 8192));
   // The number of inputs should be lower than the number of streaming
   // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
   // to the occupancy of 50%). The only supported value for k is 1, so
@@ -271,6 +299,16 @@ void cuda_circuit_bootstrap_vertical_packing_64(
           "level_count_bsk",
           number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
   switch (polynomial_size) {
+  case 256:
+    host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
+        v_stream, gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
+        (double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
+        cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
+        base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
+        base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
+        max_shared_memory);
+    break;
   case 512:
     host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out,
@@ -343,7 +381,7 @@ void cuda_circuit_bootstrap_vertical_packing_64(
  *  - 'glwe_dimension' supported dimensions: {1}
  *  - 'lwe_dimension' dimension of input lwe ciphertexts
  *  - 'polynomial_size' size of the test polynomial, supported sizes:
- * {512, 1024, 2048, 4096, 8192}
+ * {256, 512, 1024, 2048, 4096, 8192}
  *  - 'base_log_bsk'  base log parameter for bootstrapping
  *  - 'level_count_bsk' decomposition level for bootstrapping
  *  - 'base_log_ksk' base log parameter for keyswitch
@@ -374,10 +412,10 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
                      uint32_t number_of_bits_to_extract, uint32_t delta_log,
                      uint32_t number_of_inputs, uint32_t max_shared_memory) {
   assert(("Error (GPU WOP PBS): polynomial_size should be one of "
-          "512, 1024, 2048, 4096, 8192",
-          polynomial_size == 512 || polynomial_size == 1024 ||
-              polynomial_size == 2048 || polynomial_size == 4096 ||
-              polynomial_size == 8192));
+          "256, 512, 1024, 2048, 4096, 8192",
+          polynomial_size == 256 || polynomial_size == 512 ||
+              polynomial_size == 1024 || polynomial_size == 2048 ||
+              polynomial_size == 4096 || polynomial_size == 8192));
   // The number of inputs should be lower than the number of streaming
   // multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
   // to the occupancy of 50%). The only supported value for k is 1, so
@@ -390,6 +428,18 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
           "level_count_bsk",
           number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
   switch (polynomial_size) {
+  case 256:
+    host_wop_pbs<uint64_t, int64_t, Degree<256>>(
+        v_stream, gpu_index, (uint64_t *)lwe_array_out,
+        (uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
+        (double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
+        wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
+        polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
+        level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
+        level_count_cbs, number_of_bits_of_message_including_padding,
+        number_of_bits_to_extract, delta_log, number_of_inputs,
+        max_shared_memory);
+    break;
   case 512:
     host_wop_pbs<uint64_t, int64_t, Degree<512>>(
         v_stream, gpu_index, (uint64_t *)lwe_array_out,