chore(cuda): replace casts with cuda intrinsics

Agnes Leroy
2022-11-04 11:45:48 +01:00
committed by Agnès Leroy
parent da654ee9cb
commit f36f565b75
5 changed files with 73 additions and 79 deletions

View File

@@ -103,9 +103,9 @@ __global__ void device_bootstrap_amortized(
GadgetMatrix<Torus, params> gadget(base_log, level_count);
// Put "b", the body, in [0, 2N[
Torus b_hat = rescale_torus_element(
block_lwe_array_in[lwe_dimension],
2 * params::degree); // 2 * params::log2_degree + 1);
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
@@ -123,9 +123,9 @@ __global__ void device_bootstrap_amortized(
synchronize_threads_in_block();
// Put "a" in [0, 2N[ instead of Zq
Torus a_hat = rescale_torus_element(
block_lwe_array_in[iteration],
2 * params::degree); // 2 * params::log2_degree + 1);
Torus a_hat = 0;
rescale_torus_element(block_lwe_array_in[iteration], a_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
@@ -176,8 +176,8 @@ __global__ void device_bootstrap_amortized(
// Reduce the size of the FFT to be performed by storing
// the real-valued polynomial into a complex polynomial
real_to_complex_compressed<int16_t, params>(accumulator_mask_decomposed,
accumulator_fft);
real_to_complex_compressed<params>(accumulator_mask_decomposed,
accumulator_fft);
synchronize_threads_in_block();
// Switch to the FFT space
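As the comment above says, real_to_complex_compressed halves the size of the FFT by packing pairs of real coefficients into single complex values. A single-threaded sketch of that packing, for illustration only (the device version spreads the loop across threads and, after this commit, fixes the input type to int16_t):

#include <cstdint>
#include <cuda_runtime.h> // for double2

// Illustration only: pack 2 * half_n real coefficients into half_n complex
// values, even coefficients into the real parts, odd ones into the imaginary
// parts.
void real_to_complex_compressed_sketch(const int16_t *src, double2 *dst,
                                       int half_n) {
  for (int i = 0; i < half_n; ++i) {
    dst[i].x = (double)src[2 * i];
    dst[i].y = (double)src[2 * i + 1];
  }
}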
@@ -208,8 +208,8 @@ __global__ void device_bootstrap_amortized(
// Now handle the polynomial multiplication for the body
// in the same way
real_to_complex_compressed<int16_t, params>(accumulator_body_decomposed,
accumulator_fft);
real_to_complex_compressed<params>(accumulator_body_decomposed,
accumulator_fft);
synchronize_threads_in_block();
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
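The first two hunks in this file (and the matching ones in the low-latency kernel below) switch rescale_torus_element from returning its result to writing it through an output parameter, so that the per-width specializations added to the torus header further down can dispatch to the right conversion intrinsic. The arithmetic itself is unchanged: the element is mapped from the torus onto [0, 2N) so it can serve as a monomial exponent. A minimal host-side sketch of that mapping, assuming a 64-bit torus:

#include <cmath>
#include <cstdint>
#include <limits>

// Sketch only: computes round(element / 2^64 * 2N), the value the kernels call
// b_hat or a_hat. The device version does the same double-precision arithmetic
// but uses __uint2double_rn / __ull2double_rn for the integer-to-double step.
uint64_t rescale_torus_element_sketch(uint64_t element, uint32_t two_N) {
  return (uint64_t)std::round(
      (double)element /
      ((double)std::numeric_limits<uint64_t>::max() + 1.0) * (double)two_N);
}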

View File

@@ -36,7 +36,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, int16_t *glwe_decomposed,
int iteration, grid_group &grid) {
// Put the decomposed GLWE sample in the Fourier domain
real_to_complex_compressed<int16_t, params>(glwe_decomposed, fft);
real_to_complex_compressed<params>(glwe_decomposed, fft);
synchronize_threads_in_block();
// Switch to the FFT space
@@ -184,8 +184,9 @@ __global__ void device_bootstrap_low_latency(
GadgetMatrix<Torus, params> gadget(base_log, level_count);
// Put "b" in [0, 2N[
Torus b_hat = rescale_torus_element(block_lwe_array_in[lwe_dimension],
2 * params::degree);
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
if (blockIdx.y == 0) {
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
@@ -201,9 +202,9 @@ __global__ void device_bootstrap_low_latency(
synchronize_threads_in_block();
// Put "a" in [0, 2N[
Torus a_hat = rescale_torus_element(
block_lwe_array_in[i],
2 * params::degree); // 2 * params::log2_degree + 1);
Torus a_hat = 0;
rescale_torus_element(block_lwe_array_in[i], a_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
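The rescaled exponent then drives the ACC * (X^â - 1) step above. As a reminder of the negacyclic convention: multiplying a polynomial by X^a modulo X^N + 1 rotates its coefficients, and anything that wraps past degree N - 1 comes back negated, which is why a is first brought into [0, 2N). A host-side illustration only; the library's device routine additionally subtracts the original accumulator (the "_and_sub_polynomial" part) and splits the loop across threads:

#include <cstdint>
#include <vector>

// Illustration only: multiply p(X) by X^a in Z[X]/(X^N + 1), with a in [0, 2N).
std::vector<int64_t> multiply_by_monomial_negacyclic(const std::vector<int64_t> &p,
                                                     uint32_t a) {
  const uint32_t N = (uint32_t)p.size();
  std::vector<int64_t> out(N);
  for (uint32_t i = 0; i < N; ++i) {
    uint32_t j = (i + a) % (2 * N); // degree of the rotated term, in [0, 2N)
    if (j < N)
      out[j] = p[i];      // still inside the polynomial: sign unchanged
    else
      out[j - N] = -p[i]; // wrapped past degree N: X^N == -1, so negate
  }
  return out;
}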

View File

@@ -21,30 +21,12 @@
#include "utils/memory.cuh"
#include "utils/timer.cuh"
template <typename T, class params>
__device__ void fft(double2 *output, T *input) {
template <class params> __device__ void fft(double2 *output, int16_t *input) {
synchronize_threads_in_block();
// Reduce the size of the FFT to be performed by storing
// the real-valued polynomial into a complex polynomial
real_to_complex_compressed<T, params>(input, output);
synchronize_threads_in_block();
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(output);
synchronize_threads_in_block();
correction_direct_fft_inplace<params>(output);
synchronize_threads_in_block();
}
template <typename T, typename ST, class params>
__device__ void fft(double2 *output, T *input) {
synchronize_threads_in_block();
// Reduce the size of the FFT to be performed by storing
// the real-valued polynomial into a complex polynomial
real_to_complex_compressed<T, ST, params>(input, output);
real_to_complex_compressed<params>(input, output);
synchronize_threads_in_block();
// Switch to the FFT space
@@ -154,7 +136,7 @@ cmux(Torus *glwe_array_out, Torus *glwe_array_in, double2 *ggsw_in,
// First, perform the polynomial multiplication for the mask
synchronize_threads_in_block();
fft<int16_t, params>(glwe_fft, glwe_mask_decomposed);
fft<params>(glwe_fft, glwe_mask_decomposed);
// External product and accumulate
// Get the piece necessary for the multiplication
@@ -175,7 +157,7 @@ cmux(Torus *glwe_array_out, Torus *glwe_array_in, double2 *ggsw_in,
// Now handle the polynomial multiplication for the body
// in the same way
synchronize_threads_in_block();
fft<int16_t, params>(glwe_fft, glwe_body_decomposed);
fft<params>(glwe_fft, glwe_body_decomposed);
// External product and accumulate
// Get the piece necessary for the multiplication
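The two fft overloads collapse into one because the decomposed GLWE coefficients handed to this helper are always int16_t (see the glwe_decomposed parameter above), so the extra typename parameters carried no information. The only remaining template argument is the compile-time parameter class. For orientation, a hypothetical example of such a class; the member names appear throughout the diff, but the values and the reading of opt as "coefficients per thread" are assumptions:

// Hypothetical parameter class, for illustration only.
struct Degree1024 {
  static constexpr int degree = 1024;    // polynomial size N
  static constexpr int log2_degree = 10; // log2(N)
  static constexpr int opt = 8;          // assumed: coefficients per thread
};

// With the simplified signature, a call site reduces to, e.g.:
//   fft<Degree1024>(glwe_fft, glwe_mask_decomposed);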

View File

@@ -4,21 +4,26 @@
#include "types/int128.cuh"
#include <limits>
template <typename Torus>
__device__ inline Torus typecast_double_to_torus(double x) {
if constexpr (sizeof(Torus) < 8) {
// this simple cast works up to 32 bits, afterwards we must do it manually
long long ret = x;
return (Torus)ret;
} else {
int128 nnnn = make_int128_from_float(x);
uint64_t lll = nnnn.lo_;
return lll;
}
// nvcc doesn't realize that the if {} else {} above always returns
// something, and complains that this function might return nothing, so we
// put this useless return here
return 0;
template <typename T>
__device__ inline void typecast_double_to_torus(double x, T &r) {
r = T(x);
}
template <>
__device__ inline void typecast_double_to_torus<uint32_t>(double x,
uint32_t &r) {
r = __double2uint_rn(x);
}
template <>
__device__ inline void typecast_double_to_torus<uint64_t>(double x,
uint64_t &r) {
// The ull intrinsic does not behave in the same way on all architectures and
// on some platforms this causes the cmux tree test to fail
// Hence the intrinsic is not used here
uint128 nnnn = make_uint128_from_float(x);
uint64_t lll = nnnn.lo_;
r = lll;
}
template <typename T>
@@ -34,10 +39,30 @@ __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
}
template <typename T>
__device__ __forceinline__ T rescale_torus_element(T element,
uint32_t log_shift) {
return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
(double)log_shift);
__device__ __forceinline__ void rescale_torus_element(T element, T &output,
uint32_t log_shift) {
output =
round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
(double)log_shift);
}
template <>
__device__ __forceinline__ void
rescale_torus_element<uint32_t>(uint32_t element, uint32_t &output,
uint32_t log_shift) {
output =
round(__uint2double_rn(element) /
(__uint2double_rn(std::numeric_limits<uint32_t>::max()) + 1.0) *
__uint2double_rn(log_shift));
}
template <>
__device__ __forceinline__ void
rescale_torus_element<uint64_t>(uint64_t element, uint64_t &output,
uint32_t log_shift) {
output = round(__ull2double_rn(element) /
(__ull2double_rn(std::numeric_limits<uint64_t>::max()) + 1.0) *
__uint2double_rn(log_shift));
}
#endif // CNCRT_TORUS_H
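The 32-bit specializations above rely on the device conversion intrinsics, which round to nearest even, whereas a plain C-style cast truncates toward zero; for the 64-bit case the commit deliberately avoids the ull conversion intrinsic (see the comment in that specialization) and goes through the 128-bit helper instead. A small standalone program, illustrative only, showing the rounding difference for the 32-bit intrinsic:

#include <cstdio>

__global__ void compare_conversions(double x) {
  unsigned int via_intrinsic = __double2uint_rn(x); // round to nearest even
  unsigned int via_cast = (unsigned int)x;          // truncation toward zero
  printf("x = %.2f  __double2uint_rn -> %u  cast -> %u\n", x, via_intrinsic,
         via_cast);
}

int main() {
  compare_conversions<<<1, 1>>>(2.5); // both give 2 (2.5 rounds to even)
  compare_conversions<<<1, 1>>>(2.7); // intrinsic gives 3, cast gives 2
  cudaDeviceSynchronize();
  return 0;
}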

View File

@@ -6,29 +6,13 @@
/*
* function compresses decomposed buffer into half size complex buffer for fft
*/
template <typename T, class params>
__device__ void real_to_complex_compressed(T *src, double2 *dst) {
template <class params>
__device__ void real_to_complex_compressed(int16_t *src, double2 *dst) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
dst[tid].x = (double)src[2 * tid];
dst[tid].y = (double)src[2 * tid + 1];
tid += params::degree / params::opt;
}
}
template <typename T, typename ST, class params>
__device__ void real_to_complex_compressed(T *src, double2 *dst) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
ST x = src[2 * tid];
ST y = src[2 * tid + 1];
dst[tid].x = x / (double)std::numeric_limits<T>::max();
dst[tid].y = y / (double)std::numeric_limits<T>::max();
dst[tid].x = __int2double_rn(src[2 * tid]);
dst[tid].y = __int2double_rn(src[2 * tid + 1]);
tid += params::degree / params::opt;
}
}
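Here the plain casts in the compression loop become __int2double_rn. Every int, and a fortiori every int16_t coefficient, is exactly representable as a double, so the intrinsic and the cast produce identical values; the change simply makes the conversion explicit and consistent with the rest of the commit. A standalone check, illustrative only:

#include <cstdio>

__global__ void check_int16_to_double() {
  int mismatches = 0;
  for (int v = -32768; v <= 32767; ++v)
    if (__int2double_rn(v) != (double)v)
      ++mismatches;
  printf("mismatches over the int16_t range: %d\n", mismatches); // expect 0
}

int main() {
  check_int16_to_double<<<1, 1>>>();
  cudaDeviceSynchronize();
  return 0;
}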
@@ -175,14 +159,16 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
double carry = frac - floor(frac);
frac += (carry >= 0.5);
Torus V1 = typecast_double_to_torus<Torus>(frac);
Torus V1 = 0;
typecast_double_to_torus<Torus>(frac, V1);
frac = v2 - floor(v2);
frac *= mx;
carry = frac - floor(v2);
frac += (carry >= 0.5);
Torus V2 = typecast_double_to_torus<Torus>(frac);
Torus V2 = 0;
typecast_double_to_torus<Torus>(frac, V2);
result[tid * 2] += V1;
result[tid * 2 + 1] += V2;
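make_uint128_from_float itself lives in types/int128.cuh and is not part of this diff. One portable way to avoid a single double-to-64-bit conversion is to peel the value apart in 32-bit halves, for which the conversions behave identically across architectures; the sketch below shows that idea and is an assumption about the approach, not the library's implementation:

#include <cmath>
#include <cstdint>

// Sketch only: convert a non-negative double below 2^64 to uint64_t via two
// 32-bit halves; any fractional part is truncated, as a plain cast would do.
uint64_t double_to_u64_split(double x) {
  double hi = std::floor(x / 4294967296.0); // top 32 bits (divide by 2^32)
  double lo = x - hi * 4294967296.0;        // remainder below 2^32
  return ((uint64_t)(uint32_t)hi << 32) | (uint64_t)(uint32_t)lo;
}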