fix(backend-gpu): avoid warp branching in the PBS

2026-04-17 03:00:54 -04:00 · 2023-05-15 11:20:08 +02:00
parent 577e7847c6
commit b174f81687
5 changed files with 54 additions and 173 deletions
--- a/.github/workflows/concrete_cuda_test.yml
+++ b/.github/workflows/concrete_cuda_test.yml
@@ -26,6 +26,8 @@ jobs:
      - uses: actions/checkout@v3
      - name: Set up Clang
        run: |
+          sudo sed -i 's/azure\.//' /etc/apt/sources.list
+          sudo apt-get update 
          sudo apt-get install clang-format-11
      - name: Install cmake-format
        run: |
--- a/backends/concrete-cuda/implementation/src/boolean_gates.cu
+++ b/backends/concrete-cuda/implementation/src/boolean_gates.cu
@@ -95,8 +95,8 @@ extern "C" void cuda_boolean_and_32(

  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_low_latency_32(
-      v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
-      input_lwe_ciphertext_count, max_shared_memory, true);
+      v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
+      pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
  cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
      v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
      lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
@@ -203,8 +203,8 @@ extern "C" void cuda_boolean_nand_32(

  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_low_latency_32(
-      v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
-      input_lwe_ciphertext_count, max_shared_memory, true);
+      v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
+      pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
  cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
      v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
      lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
@@ -311,8 +311,8 @@ extern "C" void cuda_boolean_nor_32(

  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_low_latency_32(
-      v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
-      input_lwe_ciphertext_count, max_shared_memory, true);
+      v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
+      pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
  cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
      v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
      lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
@@ -411,8 +411,8 @@ extern "C" void cuda_boolean_or_32(

  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_low_latency_32(
-      v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
-      input_lwe_ciphertext_count, max_shared_memory, true);
+      v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
+      pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
  cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
      v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
      lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
@@ -532,8 +532,8 @@ extern "C" void cuda_boolean_xor_32(

  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_low_latency_32(
-      v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
-      input_lwe_ciphertext_count, max_shared_memory, true);
+      v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
+      pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
  cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
      v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
      lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
@@ -660,8 +660,8 @@ extern "C" void cuda_boolean_xnor_32(

  int8_t *pbs_buffer = nullptr;
  scratch_cuda_bootstrap_low_latency_32(
-      v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, pbs_level_count,
-      input_lwe_ciphertext_count, max_shared_memory, true);
+      v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
+      pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
  cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
      v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
      lwe_buffer_4, bootstrapping_key, pbs_buffer, input_lwe_dimension,
--- a/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu
+++ b/backends/concrete-cuda/implementation/src/bootstrap_low_latency.cu
@@ -98,6 +98,7 @@ uint64_t get_buffer_size_bootstrap_low_latency_64(
          input_lwe_ciphertext_count, max_shared_memory);
    break;
  default:
+    return 0;
    break;
  }
 }
--- a/backends/concrete-cuda/implementation/src/polynomial/functions.cuh
+++ b/backends/concrete-cuda/implementation/src/polynomial/functions.cuh
@@ -3,6 +3,9 @@
 #include "device.h"
 #include "utils/timer.cuh"

+// Return A if C == 0 and B if C == 1
+#define SEL(A, B, C) ((-(C) & ((A) ^ (B))) ^ (A))
+
 /*
 *  function compresses decomposed buffer into half size complex buffer for fft
 */
@@ -83,18 +86,22 @@ __device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
      tid = threadIdx.x;
      for (int i = 0; i < elems_per_thread; i++) {
        if (j < degree) {
-          if (tid < degree - j) {
-            accumulator_slice[tid] = input_slice[tid + j];
-          } else {
-            accumulator_slice[tid] = -input_slice[tid - degree + j];
-          }
+          // if (tid < degree - j)
+          //  accumulator_slice[tid] = input_slice[tid + j];
+          // else
+          //  accumulator_slice[tid] = -input_slice[tid - degree + j];
+          int x = tid + j - SEL(degree, 0, tid < degree - j);
+          accumulator_slice[tid] =
+              SEL(-1, 1, tid < degree - j) * input_slice[x];
        } else {
-          uint32_t jj = j - degree;
-          if (tid < degree - jj) {
-            accumulator_slice[tid] = -input_slice[tid + jj];
-          } else {
-            accumulator_slice[tid] = input_slice[tid - degree + jj];
-          }
+          int32_t jj = j - degree;
+          // if (tid < degree - jj)
+          //  accumulator_slice[tid] = -input_slice[tid + jj];
+          // else
+          //  accumulator_slice[tid] = input_slice[tid - degree + jj];
+          int x = tid + jj - SEL(degree, 0, tid < degree - jj);
+          accumulator_slice[tid] =
+              SEL(1, -1, tid < degree - jj) * input_slice[x];
        }
        tid += block_size;
      }
@@ -120,19 +127,22 @@ __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
    int tid = threadIdx.x;
    for (int i = 0; i < elems_per_thread; i++) {
      if (j < degree) {
-        if (tid < j) {
-          result_acc_slice[tid] = -acc_slice[tid - j + degree] - acc_slice[tid];
-        } else {
-          result_acc_slice[tid] = acc_slice[tid - j] - acc_slice[tid];
-        }
+        // if (tid < j)
+        //  result_acc_slice[tid] = -acc_slice[tid - j + degree]-acc_slice[tid];
+        // else
+        //  result_acc_slice[tid] = acc_slice[tid - j] - acc_slice[tid];
+        int x = tid - j + SEL(0, degree, tid < j);
+        result_acc_slice[tid] =
+            SEL(1, -1, tid < j) * acc_slice[x] - acc_slice[tid];
      } else {
-        uint32_t jj = j - degree;
-        if (tid < jj) {
-          result_acc_slice[tid] = acc_slice[tid - jj + degree] - acc_slice[tid];
-
-        } else {
-          result_acc_slice[tid] = -acc_slice[tid - jj] - acc_slice[tid];
-        }
+        int32_t jj = j - degree;
+        // if (tid < jj)
+        //  result_acc_slice[tid] = acc_slice[tid - jj + degree]-acc_slice[tid];
+        // else
+        //  result_acc_slice[tid] = -acc_slice[tid - jj] - acc_slice[tid];
+        int x = tid - jj + SEL(0, degree, tid < jj);
+        result_acc_slice[tid] =
+            SEL(-1, 1, tid < jj) * acc_slice[x] - acc_slice[tid];
      }
      tid += block_size;
    }
@@ -249,18 +259,12 @@ __device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
    tid = threadIdx.x;
    result[params::opt];
    for (int i = 0; i < params::opt; i++) {
-      if (1 < params::degree) {
-        if (tid < 1)
-          result[i] = -accumulator_slice[tid - 1 + params::degree];
-        else
-          result[i] = accumulator_slice[tid - 1];
-      } else {
-        uint32_t jj = 1 - (uint32_t)params::degree;
-        if (tid < jj)
-          result[i] = accumulator_slice[tid - jj + params::degree];
-        else
-          result[i] = -accumulator_slice[tid - jj];
-      }
+      // if (tid < 1)
+      //  result[i] = -accumulator_slice[tid - 1 + params::degree];
+      // else
+      //  result[i] = accumulator_slice[tid - 1];
+      int x = tid - 1 + SEL(0, params::degree, tid < 1);
+      result[i] = SEL(1, -1, tid < 1) * accumulator_slice[x];
      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();
--- a/backends/concrete-cuda/implementation/src/polynomial/polynomial.cuh
+++ b/backends/concrete-cuda/implementation/src/polynomial/polynomial.cuh
@@ -71,132 +71,6 @@ public:
    }
  }

-  __device__ void multiply_by_monomial_negacyclic(Polynomial<T, params> &result,
-                                                  uint32_t j) {
-    int tid = threadIdx.x;
-    for (int i = 0; i < params::opt; i++) {
-      if (j < params::degree) {
-        if (tid < j)
-          result.coefficients[tid] =
-              -this->coefficients[tid - j + params::degree];
-        else
-          result.coefficients[tid] = this->coefficients[tid - j];
-      } else {
-        uint32_t jj = j - params::degree;
-        if (tid < jj)
-          result.coefficients[tid] =
-              this->coefficients[tid - jj + params::degree];
-        else
-          result.coefficients[tid] = -this->coefficients[tid - jj];
-      }
-      tid += params::degree / params::opt;
-    }
-  }
-
-  __device__ void multiply_by_monomial_negacyclic_inplace(uint32_t j) {
-    int tid = threadIdx.x;
-    T result[params::opt];
-    for (int i = 0; i < params::opt; i++) {
-      if (j < params::degree) {
-        if (tid < j)
-          result[i] = -this->coefficients[tid - j + params::degree];
-        else
-          result[i] = this->coefficients[tid - j];
-      } else {
-        uint32_t jj = j - params::degree;
-        if (tid < jj)
-          result[i] = this->coefficients[tid - jj + params::degree];
-        else
-          result[i] = -this->coefficients[tid - jj];
-      }
-      tid += params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
-    tid = threadIdx.x;
-    for (int i = 0; i < params::opt; i++) {
-      coefficients[tid] = result[i];
-      tid += params::degree / params::opt;
-    }
-    synchronize_threads_in_block();
-  }
-
-  __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
-      Polynomial<T, params> &result, uint32_t j) {
-    int tid = threadIdx.x;
-    for (int i = 0; i < params::opt; i++) {
-      if (j < params::degree) {
-        if (tid < j)
-          result.coefficients[tid] =
-              -this->coefficients[tid - j + params::degree] -
-              this->coefficients[tid];
-        else
-          result.coefficients[tid] =
-              this->coefficients[tid - j] - this->coefficients[tid];
-      } else {
-        uint32_t jj = j - params::degree;
-        if (tid < jj)
-          result.coefficients[tid] =
-              this->coefficients[tid - jj + params::degree] -
-              this->coefficients[tid];
-        else
-          result.coefficients[tid] =
-              -this->coefficients[tid - jj] - this->coefficients[tid];
-      }
-      tid += params::degree / params::opt;
-    }
-  }
-
-  __device__ void divide_by_monomial_negacyclic(Polynomial<T, params> &result,
-                                                uint32_t j) {
-    int tid = threadIdx.x;
-    for (int i = 0; i < params::opt; i++) {
-      if (j < params::degree) {
-        if (tid < params::degree - j) {
-          result.coefficients[tid] = this->coefficients[tid + j];
-        } else {
-          result.coefficients[tid] =
-              -this->coefficients[tid - params::degree + j];
-        }
-      } else {
-        uint32_t jj = j - params::degree;
-        if (tid < params::degree - jj) {
-          result.coefficients[tid] = -this->coefficients[tid + jj];
-        } else {
-          result.coefficients[tid] =
-              this->coefficients[tid - params::degree + jj];
-        }
-      }
-      tid += params::degree / params::opt;
-    }
-  }
-
-  __device__ void divide_by_monomial_negacyclic_inplace(uint32_t j) {
-    int tid = threadIdx.x;
-    T result[params::opt];
-    for (int i = 0; i < params::opt; i++) {
-      if (j < params::degree) {
-        if (tid < params::degree - j) {
-          result[i] = this->coefficients[tid + j];
-        } else {
-          result[i] = -this->coefficients[tid - params::degree + j];
-        }
-      } else {
-        uint32_t jj = j - params::degree;
-        if (tid < params::degree - jj) {
-          result[i] = -this->coefficients[tid + jj];
-        } else {
-          result[i] = this->coefficients[tid - params::degree + jj];
-        }
-      }
-      tid += params::degree / params::opt;
-    }
-    tid = threadIdx.x;
-    for (int i = 0; i < params::opt; i++) {
-      coefficients[tid] = result[i];
-      tid = tid + params::degree / params::opt;
-    }
-  }
-
  __device__ void round_to_closest_multiple_inplace(uint32_t base_log,
                                                    uint32_t level_count) {
    int tid = threadIdx.x;