refactor(cuda): Implement support for N=256 in the cmux tree, bit extraction, and CBS.
Pedro Alves authored on 2023-02-22 11:13:38 -03:00; committed by Agnès Leroy
parent 75e9baae78
commit 184d453387
4 changed files with 187 additions and 37 deletions
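Each entry point touched by this commit gains a 'case 256:' branch that instantiates the templated implementation with Degree<256>, next to the existing 512-8192 cases, and the matching asserts and doc comments are extended to list 256. The sketch below is illustrative only, not code from this commit: 'run_for_degree', 'dispatch_on_polynomial_size' and the local 'Degree' stand-in are invented names, used just to show the dispatch pattern the hunks repeat.

#include <cassert>
#include <cstdint>

template <int N> struct Degree {};  // stand-in for the library's Degree tag

template <class params> void run_for_degree() { /* templated kernel launch */ }

inline void dispatch_on_polynomial_size(uint32_t polynomial_size) {
  switch (polynomial_size) {
  case 256:  run_for_degree<Degree<256>>();  break;  // newly supported size
  case 512:  run_for_degree<Degree<512>>();  break;
  case 1024: run_for_degree<Degree<1024>>(); break;
  case 2048: run_for_degree<Degree<2048>>(); break;
  case 4096: run_for_degree<Degree<4096>>(); break;
  case 8192: run_for_degree<Degree<8192>>(); break;
  default:   assert(false && "unsupported polynomial size");
  }
}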


@@ -12,6 +12,12 @@ void scratch_cuda_extract_bits_32(
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_extract_bits<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_extract_bits<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
@@ -59,6 +65,12 @@ void scratch_cuda_extract_bits_64(
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_extract_bits<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_extract_bits<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
@@ -110,10 +122,10 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
assert(("Error (GPU extract bits): base log should be <= 32",
base_log_bsk <= 32));
assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
"512, 1024, 2048, 4096, 8192",
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
lwe_dimension_in == 8192));
"256, 512, 1024, 2048, 4096, 8192",
lwe_dimension_in == 256 || lwe_dimension_in == 512 ||
lwe_dimension_in == 1024 || lwe_dimension_in == 2048 ||
lwe_dimension_in == 4096 || lwe_dimension_in == 8192));
assert(("Error (GPU extract bits): lwe_dimension_in should be equal to "
"polynomial_size",
lwe_dimension_in == polynomial_size));
@@ -130,6 +142,15 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
switch (lwe_dimension_in) {
case 256:
host_extract_bits<uint32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
max_shared_memory);
break;
case 512:
host_extract_bits<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
@@ -210,7 +231,7 @@ void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
* - 'ksk' keyswitch key
* - 'fourier_bsk' complex compressed bsk in fourier domain
* - 'lwe_dimension_in' input LWE ciphertext dimension, supported input
- * dimensions are: {512, 1024,2048, 4096, 8192}
+ * dimensions are: {256, 512, 1024,2048, 4096, 8192}
* - 'lwe_dimension_out' output LWE ciphertext dimension
* - 'glwe_dimension' GLWE dimension, only glwe_dimension = 1 is supported
* for now
@@ -238,10 +259,10 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
assert(("Error (GPU extract bits): base log should be <= 64",
base_log_bsk <= 64));
assert(("Error (GPU extract bits): lwe_dimension_in should be one of "
"512, 1024, 2048, 4096, 8192",
lwe_dimension_in == 512 || lwe_dimension_in == 1024 ||
lwe_dimension_in == 2048 || lwe_dimension_in == 4096 ||
lwe_dimension_in == 8192));
"256, 512, 1024, 2048, 4096, 8192",
lwe_dimension_in == 256 || lwe_dimension_in == 512 ||
lwe_dimension_in == 1024 || lwe_dimension_in == 2048 ||
lwe_dimension_in == 4096 || lwe_dimension_in == 8192));
assert(("Error (GPU extract bits): lwe_dimension_in should be equal to "
"polynomial_size",
lwe_dimension_in == polynomial_size));
@@ -258,6 +279,15 @@ void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
number_of_samples <= number_of_sm / 4. / 2. / level_count_bsk));
switch (lwe_dimension_in) {
case 256:
host_extract_bits<uint64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits, delta_log, lwe_dimension_in,
lwe_dimension_out, glwe_dimension, polynomial_size, base_log_bsk,
level_count_bsk, base_log_ksk, level_count_ksk, number_of_samples,
max_shared_memory);
break;
case 512:
host_extract_bits<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,

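The bit-extraction entry points above now require lwe_dimension_in == polynomial_size with polynomial_size in {256, 512, 1024, 2048, 4096, 8192}, i.e. the powers of two from 2^8 to 2^13. The helper below only illustrates that set; the name 'is_supported_polynomial_size' is invented, and the library keeps the explicit asserts shown above.

#include <cstdint>

// Membership test for {256, 512, 1024, 2048, 4096, 8192}: powers of two
// between 2^8 and 2^13 (illustrative, not part of the library).
static inline bool is_supported_polynomial_size(uint32_t n) {
  return n >= 256 && n <= 8192 && (n & (n - 1)) == 0;
}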

@@ -13,6 +13,12 @@ void scratch_cuda_circuit_bootstrap_32(
uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
@@ -60,6 +66,12 @@ void scratch_cuda_circuit_bootstrap_64(
uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
@@ -108,10 +120,10 @@ void cuda_circuit_bootstrap_32(
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
@@ -124,6 +136,15 @@ void cuda_circuit_bootstrap_32(
"level_count_bsk",
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
switch (polynomial_size) {
case 256:
host_circuit_bootstrap<uint32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 512:
host_circuit_bootstrap<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
@@ -208,10 +229,10 @@ void cuda_circuit_bootstrap_64(
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
@@ -225,6 +246,15 @@ void cuda_circuit_bootstrap_64(
number_of_inputs <= number_of_sm / 4. / 2. / level_bsk));
// The number of samples should be lower than the number of streaming
switch (polynomial_size) {
case 256:
host_circuit_bootstrap<uint64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 512:
host_circuit_bootstrap<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,

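The circuit-bootstrap asserts above encode the batching limit described in the comments: with k = 1 (so k + 1 = 2) and a factor 4 tied to the 50% occupancy target, number_of_inputs must not exceed number_of_sm / 4 / 2 / level_bsk. A host-side sketch of that bound follows; 'max_batch_size' is an invented helper, not a library function.

#include <cstdint>
#include <cuda_runtime.h>

// Upper bound on the batch size implied by the asserts above, assuming
// k = 1 (hence the factor 2) and the occupancy-related factor 4.
static inline uint32_t max_batch_size(int gpu_index, uint32_t level_bsk) {
  int number_of_sm = 0;
  cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount,
                         gpu_index);
  return (uint32_t)(number_of_sm / 4. / 2. / (double)level_bsk);
}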

@@ -16,6 +16,11 @@ void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_cmux_tree<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, r, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_cmux_tree<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
@@ -59,6 +64,11 @@ void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_cmux_tree<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, r, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_cmux_tree<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
@@ -101,17 +111,24 @@ void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
uint32_t max_shared_memory) {
assert(("Error (GPU Cmux tree): base log should be <= 32", base_log <= 32));
assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
assert((
"Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
"be >= 1 ",
r >= 1));
switch (polynomial_size) {
case 256:
host_cmux_tree<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, r, tau, max_shared_memory);
break;
case 512:
host_cmux_tree<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
@@ -182,17 +199,24 @@ void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
uint32_t max_shared_memory) {
assert(("Error (GPU Cmux tree): base log should be <= 64", base_log <= 64));
assert(("Error (GPU Cmux tree): polynomial size should be one of 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
assert((
"Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// For larger k we will need to adjust the mask size
assert(("Error (GPU Cmux tree): r, the number of layers in the tree, should "
"be >= 1 ",
r >= 1));
switch (polynomial_size) {
case 256:
host_cmux_tree<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, r, tau, max_shared_memory);
break;
case 512:
host_cmux_tree<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
@@ -251,6 +275,11 @@ void scratch_cuda_blind_rotation_sample_extraction_32(
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
@@ -293,6 +322,11 @@ void scratch_cuda_blind_rotation_sample_extraction_64(
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
@@ -353,6 +387,12 @@ void cuda_blind_rotate_and_sample_extraction_64(
uint32_t l_gadget, uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
polynomial_size, base_log, l_gadget, max_shared_memory);
break;
case 512:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,

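In the cmux-tree entry points above, 'r' is the number of layers and must be at least 1. Assuming the usual vertical-packing layout, where r GGSW selectors pick one of 2^r LUT polynomials per output (an assumption about the layout, not something stated in this diff), the LUT footprint scales as sketched below; with N = 256 it is half of what N = 512 needs for the same r and tau.

#include <cstddef>
#include <cstdint>

// Back-of-the-envelope LUT sizing under the assumed vertical-packing layout:
// 2^r polynomials of 'polynomial_size' coefficients for each of 'tau' outputs.
static inline size_t lut_vector_coefficients(uint32_t r, uint32_t tau,
                                             uint32_t polynomial_size) {
  return (size_t{1} << r) * tau * polynomial_size;
}
// Example: r = 10, tau = 1, polynomial_size = 256 -> 262,144 coefficients,
// i.e. 2 MiB of 64-bit values versus 4 MiB with polynomial_size = 512.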

@@ -14,6 +14,12 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_32(
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_count_cbs, number_of_inputs, tau,
max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
@@ -63,6 +69,12 @@ void scratch_cuda_circuit_bootstrap_vertical_packing_64(
bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_count_cbs, number_of_inputs, tau,
max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
@@ -113,6 +125,14 @@ void scratch_cuda_wop_pbs_32(
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_wop_pbs<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_of_message_including_padding,
number_of_bits_to_extract, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_wop_pbs<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
@@ -173,6 +193,14 @@ void scratch_cuda_wop_pbs_64(
uint32_t number_of_bits_to_extract, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 256:
scratch_wop_pbs<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_of_message_including_padding,
number_of_bits_to_extract, number_of_inputs, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_wop_pbs<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log, cbs_delta_log,
@@ -232,7 +260,7 @@ void scratch_cuda_wop_pbs_64(
* - 'lut_vector' list of test vectors
* - 'cbs_vp_buffer' a pre-allocated array to store intermediate results
* - 'polynomial_size' size of the test polynomial, supported sizes:
- * {512, 1024, 2048, 4096, 8192}
+ * {256, 512, 1024, 2048, 4096, 8192}
* - 'glwe_dimension' supported dimensions: {1}
* - 'lwe_dimension' dimension of input LWE ciphertexts
* - 'level_count_bsk' decomposition level for bootstrapping
@@ -255,10 +283,10 @@ void cuda_circuit_bootstrap_vertical_packing_64(
uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
uint32_t max_shared_memory) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of inputs should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
@@ -271,6 +299,16 @@ void cuda_circuit_bootstrap_vertical_packing_64(
"level_count_bsk",
number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
switch (polynomial_size) {
case 256:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
max_shared_memory);
break;
case 512:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
@@ -343,7 +381,7 @@ void cuda_circuit_bootstrap_vertical_packing_64(
* - 'glwe_dimension' supported dimensions: {1}
* - 'lwe_dimension' dimension of input lwe ciphertexts
* - 'polynomial_size' size of the test polynomial, supported sizes:
- * {512, 1024, 2048, 4096, 8192}
+ * {256, 512, 1024, 2048, 4096, 8192}
* - 'base_log_bsk' base log parameter for bootstrapping
* - 'level_count_bsk' decomposition level for bootstrapping
* - 'base_log_ksk' base log parameter for keyswitch
@@ -374,10 +412,10 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
uint32_t number_of_bits_to_extract, uint32_t delta_log,
uint32_t number_of_inputs, uint32_t max_shared_memory) {
assert(("Error (GPU WOP PBS): polynomial_size should be one of "
"512, 1024, 2048, 4096, 8192",
polynomial_size == 512 || polynomial_size == 1024 ||
polynomial_size == 2048 || polynomial_size == 4096 ||
polynomial_size == 8192));
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of inputs should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
@@ -390,6 +428,18 @@ void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
"level_count_bsk",
number_of_inputs <= number_of_sm / 4. / 2. / level_count_bsk));
switch (polynomial_size) {
case 256:
host_wop_pbs<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, number_of_bits_of_message_including_padding,
number_of_bits_to_extract, delta_log, number_of_inputs,
max_shared_memory);
break;
case 512:
host_wop_pbs<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
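With the extended assert, the WoP-PBS path accepts N = 256 as well. As a reminder of the usual convention for 'delta_log' on a 64-bit torus (a convention, hedged here because this diff does not state it): the message, padding bit included, occupies the most significant bits, so delta_log = 64 - number_of_bits_of_message_including_padding.

#include <cstdint>

// Conventional MSB placement of the message in a 64-bit torus value
// (assumption, not asserted by this commit).
static inline uint32_t delta_log_for(
    uint32_t number_of_bits_of_message_including_padding) {
  return 64u - number_of_bits_of_message_including_padding;
}
// Example: 5 message bits + 1 padding bit -> delta_log = 58.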