From f36f565b758874de971c3143d54476e8c3f10faf Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Fri, 4 Nov 2022 11:45:48 +0100 Subject: [PATCH] chore(cuda): replace casts with cuda intrinsics --- src/bootstrap_amortized.cuh | 20 +++++------ src/bootstrap_low_latency.cuh | 13 ++++---- src/bootstrap_wop.cuh | 26 +++------------ src/crypto/torus.cuh | 63 ++++++++++++++++++++++++----------- src/polynomial/functions.cuh | 30 +++++------------ 5 files changed, 73 insertions(+), 79 deletions(-) diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh index 714ed760d..6fcd8a51e 100644 --- a/src/bootstrap_amortized.cuh +++ b/src/bootstrap_amortized.cuh @@ -103,9 +103,9 @@ __global__ void device_bootstrap_amortized( GadgetMatrix gadget(base_log, level_count); // Put "b", the body, in [0, 2N[ - Torus b_hat = rescale_torus_element( - block_lwe_array_in[lwe_dimension], - 2 * params::degree); // 2 * params::log2_degree + 1); + Torus b_hat = 0; + rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat, + 2 * params::degree); // 2 * params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( @@ -123,9 +123,9 @@ __global__ void device_bootstrap_amortized( synchronize_threads_in_block(); // Put "a" in [0, 2N[ instead of Zq - Torus a_hat = rescale_torus_element( - block_lwe_array_in[iteration], - 2 * params::degree); // 2 * params::log2_degree + 1); + Torus a_hat = 0; + rescale_torus_element(block_lwe_array_in[iteration], a_hat, + 2 * params::degree); // 2 * params::log2_degree + 1); // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< @@ -176,8 +176,8 @@ __global__ void device_bootstrap_amortized( // Reduce the size of the FFT to be performed by storing // the real-valued polynomial into a complex polynomial - real_to_complex_compressed(accumulator_mask_decomposed, - accumulator_fft); + real_to_complex_compressed(accumulator_mask_decomposed, + accumulator_fft); synchronize_threads_in_block(); // Switch to the FFT space @@ -208,8 +208,8 @@ __global__ void device_bootstrap_amortized( // Now handle the polynomial multiplication for the body // in the same way - real_to_complex_compressed(accumulator_body_decomposed, - accumulator_fft); + real_to_complex_compressed(accumulator_body_decomposed, + accumulator_fft); synchronize_threads_in_block(); NSMFFT_direct>(accumulator_fft); diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh index e9335d593..7bd73fa8f 100644 --- a/src/bootstrap_low_latency.cuh +++ b/src/bootstrap_low_latency.cuh @@ -36,7 +36,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, int16_t *glwe_decomposed, int iteration, grid_group &grid) { // Put the decomposed GLWE sample in the Fourier domain - real_to_complex_compressed(glwe_decomposed, fft); + real_to_complex_compressed(glwe_decomposed, fft); synchronize_threads_in_block(); // Switch to the FFT space @@ -184,8 +184,9 @@ __global__ void device_bootstrap_low_latency( GadgetMatrix gadget(base_log, level_count); // Put "b" in [0, 2N[ - Torus b_hat = rescale_torus_element(block_lwe_array_in[lwe_dimension], - 2 * params::degree); + Torus b_hat = 0; + rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat, + 2 * params::degree); if (blockIdx.y == 0) { divide_by_monomial_negacyclic_inplace -__device__ void fft(double2 *output, T *input) { +template __device__ void fft(double2 *output, int16_t *input) { synchronize_threads_in_block(); // Reduce the size of the FFT to be performed by storing // the real-valued polynomial into a complex polynomial - real_to_complex_compressed(input, output); - synchronize_threads_in_block(); - - // Switch to the FFT space - NSMFFT_direct>(output); - synchronize_threads_in_block(); - - correction_direct_fft_inplace(output); - synchronize_threads_in_block(); -} - -template -__device__ void fft(double2 *output, T *input) { - synchronize_threads_in_block(); - - // Reduce the size of the FFT to be performed by storing - // the real-valued polynomial into a complex polynomial - real_to_complex_compressed(input, output); + real_to_complex_compressed(input, output); synchronize_threads_in_block(); // Switch to the FFT space @@ -154,7 +136,7 @@ cmux(Torus *glwe_array_out, Torus *glwe_array_in, double2 *ggsw_in, // First, perform the polynomial multiplication for the mask synchronize_threads_in_block(); - fft(glwe_fft, glwe_mask_decomposed); + fft(glwe_fft, glwe_mask_decomposed); // External product and accumulate // Get the piece necessary for the multiplication @@ -175,7 +157,7 @@ cmux(Torus *glwe_array_out, Torus *glwe_array_in, double2 *ggsw_in, // Now handle the polynomial multiplication for the body // in the same way synchronize_threads_in_block(); - fft(glwe_fft, glwe_body_decomposed); + fft(glwe_fft, glwe_body_decomposed); // External product and accumulate // Get the piece necessary for the multiplication diff --git a/src/crypto/torus.cuh b/src/crypto/torus.cuh index a72627b47..a0ffd1180 100644 --- a/src/crypto/torus.cuh +++ b/src/crypto/torus.cuh @@ -4,21 +4,26 @@ #include "types/int128.cuh" #include -template -__device__ inline Torus typecast_double_to_torus(double x) { - if constexpr (sizeof(Torus) < 8) { - // this simple cast works up to 32 bits, afterwards we must do it manually - long long ret = x; - return (Torus)ret; - } else { - int128 nnnn = make_int128_from_float(x); - uint64_t lll = nnnn.lo_; - return lll; - } - // nvcc doesn't get it that the if {} else {} above should always return - // something, and complains that this function might return nothing, so we - // put this useless return here - return 0; +template +__device__ inline void typecast_double_to_torus(double x, T &r) { + r = T(x); +} + +template <> +__device__ inline void typecast_double_to_torus(double x, + uint32_t &r) { + r = __double2uint_rn(x); +} + +template <> +__device__ inline void typecast_double_to_torus(double x, + uint64_t &r) { + // The ull intrinsic does not behave in the same way on all architectures and + // on some platforms this causes the cmux tree test to fail + // Hence the intrinsic is not used here + uint128 nnnn = make_uint128_from_float(x); + uint64_t lll = nnnn.lo_; + r = lll; } template @@ -34,10 +39,30 @@ __device__ inline T round_to_closest_multiple(T x, uint32_t base_log, } template -__device__ __forceinline__ T rescale_torus_element(T element, - uint32_t log_shift) { - return round((double)element / (double(std::numeric_limits::max()) + 1.0) * - (double)log_shift); +__device__ __forceinline__ void rescale_torus_element(T element, T &output, + uint32_t log_shift) { + output = + round((double)element / (double(std::numeric_limits::max()) + 1.0) * + (double)log_shift); + ; } +template <> +__device__ __forceinline__ void +rescale_torus_element(uint32_t element, uint32_t &output, + uint32_t log_shift) { + output = + round(__uint2double_rn(element) / + (__uint2double_rn(std::numeric_limits::max()) + 1.0) * + __uint2double_rn(log_shift)); +} + +template <> +__device__ __forceinline__ void +rescale_torus_element(uint64_t element, uint64_t &output, + uint32_t log_shift) { + output = round(__ull2double_rn(element) / + (__ull2double_rn(std::numeric_limits::max()) + 1.0) * + __uint2double_rn(log_shift)); +} #endif // CNCRT_TORUS_H \ No newline at end of file diff --git a/src/polynomial/functions.cuh b/src/polynomial/functions.cuh index c594b8deb..fa14a98a7 100644 --- a/src/polynomial/functions.cuh +++ b/src/polynomial/functions.cuh @@ -6,29 +6,13 @@ /* * function compresses decomposed buffer into half size complex buffer for fft */ -template -__device__ void real_to_complex_compressed(T *src, double2 *dst) { +template +__device__ void real_to_complex_compressed(int16_t *src, double2 *dst) { int tid = threadIdx.x; #pragma unroll for (int i = 0; i < params::opt / 2; i++) { - dst[tid].x = (double)src[2 * tid]; - dst[tid].y = (double)src[2 * tid + 1]; - tid += params::degree / params::opt; - } -} - -template -__device__ void real_to_complex_compressed(T *src, double2 *dst) { - int tid = threadIdx.x; - -#pragma unroll - for (int i = 0; i < params::opt / 2; i++) { - ST x = src[2 * tid]; - ST y = src[2 * tid + 1]; - - dst[tid].x = x / (double)std::numeric_limits::max(); - dst[tid].y = y / (double)std::numeric_limits::max(); - + dst[tid].x = __int2double_rn(src[2 * tid]); + dst[tid].y = __int2double_rn(src[2 * tid + 1]); tid += params::degree / params::opt; } } @@ -175,14 +159,16 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) { double carry = frac - floor(frac); frac += (carry >= 0.5); - Torus V1 = typecast_double_to_torus(frac); + Torus V1 = 0; + typecast_double_to_torus(frac, V1); frac = v2 - floor(v2); frac *= mx; carry = frac - floor(v2); frac += (carry >= 0.5); - Torus V2 = typecast_double_to_torus(frac); + Torus V2 = 0; + typecast_double_to_torus(frac, V2); result[tid * 2] += V1; result[tid * 2 + 1] += V2;