mirror of
https://github.com/zama-ai/tfhe-rs.git
synced 2026-01-15 01:28:20 -05:00
Compare commits
3 Commits
pa/paralle
...
feat/gpu/f
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d45cb74476 | ||
|
|
772c049681 | ||
|
|
516ae67990 |
@@ -62,6 +62,7 @@ fn main() {
|
||||
"cuda/include/integer/integer.h",
|
||||
"cuda/include/keyswitch.h",
|
||||
"cuda/include/linear_algebra.h",
|
||||
"cuda/include/pbs/fft.h",
|
||||
"cuda/include/pbs/programmable_bootstrap.h",
|
||||
"cuda/include/pbs/programmable_bootstrap_multibit.h",
|
||||
];
|
||||
|
||||
@@ -83,7 +83,7 @@ endif()
|
||||
set(CMAKE_CUDA_FLAGS
|
||||
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
|
||||
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
|
||||
--use_fast_math -Xcompiler -fPIC")
|
||||
--use_fast_math -Xcompiler -fPIC --ptxas-options=-v")
|
||||
|
||||
set(INCLUDE_DIR include)
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
|
||||
|
||||
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
|
||||
|
||||
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
|
||||
cudaStream_t stream, uint32_t gpu_index);
|
||||
|
||||
void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,
|
||||
|
||||
17
backends/tfhe-cuda-backend/cuda/include/pbs/fft.h
Normal file
17
backends/tfhe-cuda-backend/cuda/include/pbs/fft.h
Normal file
@@ -0,0 +1,17 @@
|
||||
#include <stdint.h>
|
||||
extern "C" {
|
||||
void cuda_fourier_transform_forward_as_torus_f128_async(
|
||||
void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
|
||||
void *im1, void const *standard, uint32_t const N,
|
||||
const uint32_t number_of_samples);
|
||||
|
||||
void cuda_fourier_transform_forward_as_integer_f128_async(
|
||||
void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
|
||||
void *im1, void const *standard, uint32_t const N,
|
||||
const uint32_t number_of_samples);
|
||||
|
||||
void cuda_fourier_transform_backward_as_torus_f128_async(
|
||||
void *stream, uint32_t gpu_index, void *standard, void const *re0,
|
||||
void const *re1, void const *im0, void const *im1, uint32_t const N,
|
||||
const uint32_t number_of_samples);
|
||||
}
|
||||
@@ -97,7 +97,7 @@ bool cuda_check_support_thread_block_clusters() {
|
||||
}
|
||||
|
||||
/// Copy memory to the GPU asynchronously
|
||||
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
|
||||
void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
|
||||
cudaStream_t stream, uint32_t gpu_index) {
|
||||
if (size == 0)
|
||||
return;
|
||||
|
||||
370
backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
Normal file
370
backends/tfhe-cuda-backend/cuda/src/fft128/f128.cuh
Normal file
@@ -0,0 +1,370 @@
|
||||
|
||||
#ifndef TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_F128_CUH_
|
||||
#define TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_F128_CUH_
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
struct alignas(16) f128 {
|
||||
double hi;
|
||||
double lo;
|
||||
|
||||
// Default and parameterized constructors
|
||||
__host__ __device__ f128() : hi(0.0), lo(0.0) {}
|
||||
__host__ __device__ f128(double high, double low) : hi(high), lo(low) {}
|
||||
|
||||
// Quick two-sum
|
||||
__host__ __device__ __forceinline__ static f128 quick_two_sum(double a,
|
||||
double b) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
double s = __dadd_rn(a, b);
|
||||
return f128(s, __dsub_rn(b, __dsub_rn(s, a)));
|
||||
#else
|
||||
double s = a + b;
|
||||
return f128(s, b - (s - a));
|
||||
#endif;
|
||||
}
|
||||
|
||||
// Two-sum
|
||||
__host__ __device__ __forceinline__ static f128 two_sum(double a, double b) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
double s = __dadd_rn(a, b);
|
||||
double bb = __dsub_rn(s, a);
|
||||
return f128(s, __dadd_rn(__dsub_rn(a, __dsub_rn(s, bb)), __dsub_rn(b, bb)));
|
||||
#else
|
||||
double s = a + b;
|
||||
double bb = s - a;
|
||||
return f128(s, (a - (s - bb)) + (b - bb));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Two-product
|
||||
__host__ __device__ __forceinline__ static f128 two_prod(double a, double b) {
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
double p = __dmul_rn(a, b);
|
||||
double p2 = __fma_rn(a, b, -p);
|
||||
#else
|
||||
double p = a * b;
|
||||
double p2 = fma(a, b, -p);
|
||||
#endif
|
||||
return f128(p, p2);
|
||||
}
|
||||
|
||||
__host__ __device__ __forceinline__ static f128 two_diff(double a, double b) {
|
||||
#ifdef __CUDA_ARCH__
|
||||
double s = __dsub_rn(a, b);
|
||||
double bb = __dsub_rn(s, a);
|
||||
return f128(s, __dsub_rn(__dsub_rn(a, __dsub_rn(s, bb)), __dadd_rn(b, bb)));
|
||||
#else
|
||||
double s = a - b;
|
||||
double bb = s - a;
|
||||
return f128(s, (a - (s - bb)) - (b + bb));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Addition
|
||||
__host__ __device__ static f128 add(const f128 &a, const f128 &b) {
|
||||
auto s = two_sum(a.hi, b.hi);
|
||||
auto t = two_sum(a.lo, b.lo);
|
||||
|
||||
double hi = s.hi;
|
||||
double lo = s.lo + t.hi;
|
||||
hi = hi + lo;
|
||||
lo = lo - (hi - s.hi);
|
||||
|
||||
return f128(hi, lo + t.lo);
|
||||
}
|
||||
|
||||
// Addition with estimate
|
||||
__host__ __device__ static f128 add_estimate(const f128 &a, const f128 &b) {
|
||||
auto se = two_sum(a.hi, b.hi);
|
||||
#ifdef __CUDA_ARCH__
|
||||
se.lo = __dadd_rn(se.lo, __dadd_rn(a.lo, b.lo));
|
||||
#else
|
||||
se.lo += (a.lo + b.lo);
|
||||
#endif
|
||||
|
||||
return quick_two_sum(se.hi, se.lo);
|
||||
}
|
||||
|
||||
// Subtraction with estimate
|
||||
__host__ __device__ static f128 sub_estimate(const f128 &a, const f128 &b) {
|
||||
f128 se = two_diff(a.hi, b.hi);
|
||||
#ifdef __CUDA_ARCH__
|
||||
se.lo = __dadd_rn(se.lo, a.lo);
|
||||
se.lo = __dsub_rn(se.lo, b.lo);
|
||||
#else
|
||||
se.lo += a.lo;
|
||||
se.lo -= b.lo;
|
||||
#endif
|
||||
return quick_two_sum(se.hi, se.lo);
|
||||
}
|
||||
|
||||
// Subtraction
|
||||
__host__ __device__ static f128 sub(const f128 &a, const f128 &b) {
|
||||
auto s = two_diff(a.hi, b.hi);
|
||||
auto t = two_diff(a.lo, b.lo);
|
||||
s = quick_two_sum(s.hi, s.lo + t.hi);
|
||||
return quick_two_sum(s.hi, s.lo + t.lo);
|
||||
}
|
||||
|
||||
// Multiplication
|
||||
__host__ __device__ static f128 mul(const f128 &a, const f128 &b) {
|
||||
auto p = two_prod(a.hi, b.hi);
|
||||
#ifdef __CUDA_ARCH__
|
||||
double a_0_x_b_1 = __dmul_rn(a.hi, b.lo);
|
||||
double a_1_x_b_0 = __dmul_rn(a.lo, b.hi);
|
||||
p.lo = __dadd_rn(p.lo, __dadd_rn(a_0_x_b_1, a_1_x_b_0));
|
||||
#else
|
||||
p.lo += (a.hi * b.lo + a.lo * b.hi);
|
||||
#endif
|
||||
p = quick_two_sum(p.hi, p.lo);
|
||||
return p;
|
||||
}
|
||||
|
||||
__host__ __device__ static void
|
||||
cplx_f128_mul_assign(f128 &c_re, f128 &c_im, const f128 &a_re,
|
||||
const f128 &a_im, const f128 &b_re, const f128 &b_im) {
|
||||
auto a_re_x_b_re = mul(a_re, b_re);
|
||||
auto a_re_x_b_im = mul(a_re, b_im);
|
||||
auto a_im_x_b_re = mul(a_im, b_re);
|
||||
auto a_im_x_b_im = mul(a_im, b_im);
|
||||
|
||||
c_re = sub_estimate(a_re_x_b_re, a_im_x_b_im);
|
||||
c_im = add_estimate(a_im_x_b_re, a_re_x_b_im);
|
||||
}
|
||||
|
||||
__host__ __device__ static void
|
||||
cplx_f128_sub_assign(f128 &c_re, f128 &c_im, const f128 &a_re,
|
||||
const f128 &a_im, const f128 &b_re, const f128 &b_im) {
|
||||
c_re = sub_estimate(a_re, b_re);
|
||||
c_im = sub_estimate(a_im, b_im);
|
||||
}
|
||||
__host__ __device__ static void
|
||||
cplx_f128_add_assign(f128 &c_re, f128 &c_im, const f128 &a_re,
|
||||
const f128 &a_im, const f128 &b_re, const f128 &b_im) {
|
||||
c_re = add_estimate(a_re, b_re);
|
||||
c_im = add_estimate(a_im, b_im);
|
||||
}
|
||||
};
|
||||
|
||||
struct f128x2 {
|
||||
f128 re;
|
||||
f128 im;
|
||||
|
||||
__host__ __device__ f128x2() : re(), im() {}
|
||||
|
||||
__host__ __device__ f128x2(const f128 &real, const f128 &imag)
|
||||
: re(real), im(imag) {}
|
||||
|
||||
__host__ __device__ f128x2(double real, double imag)
|
||||
: re(real, 0.0), im(imag, 0.0) {}
|
||||
|
||||
__host__ __device__ explicit f128x2(double real)
|
||||
: re(real, 0.0), im(0.0, 0.0) {}
|
||||
|
||||
__host__ __device__ f128x2(const f128x2 &other)
|
||||
: re(other.re), im(other.im) {}
|
||||
|
||||
__host__ __device__ f128x2(f128x2 &&other) noexcept
|
||||
: re(std::move(other.re)), im(std::move(other.im)) {}
|
||||
|
||||
__host__ __device__ f128x2 &operator=(const f128x2 &other) {
|
||||
if (this != &other) {
|
||||
re = other.re;
|
||||
im = other.im;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
__host__ __device__ f128x2 &operator=(f128x2 &&other) noexcept {
|
||||
if (this != &other) {
|
||||
re = std::move(other.re);
|
||||
im = std::move(other.im);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
__host__ __device__ f128x2 conjugate() const {
|
||||
return f128x2(re, f128(-im.hi, -im.lo));
|
||||
}
|
||||
|
||||
__host__ __device__ f128 norm_squared() const {
|
||||
return f128::add(f128::mul(re, re), f128::mul(im, im));
|
||||
}
|
||||
|
||||
__host__ __device__ void zero() {
|
||||
re = f128(0.0, 0.0);
|
||||
im = f128(0.0, 0.0);
|
||||
}
|
||||
|
||||
// Addition
|
||||
__host__ __device__ friend f128x2 operator+(const f128x2 &a,
|
||||
const f128x2 &b) {
|
||||
return f128x2(f128::add(a.re, b.re), f128::add(a.im, b.im));
|
||||
}
|
||||
|
||||
// Subtraction
|
||||
__host__ __device__ friend f128x2 operator-(const f128x2 &a,
|
||||
const f128x2 &b) {
|
||||
return f128x2(f128::add(a.re, f128(-b.re.hi, -b.re.lo)),
|
||||
f128::add(a.im, f128(-b.im.hi, -b.im.lo)));
|
||||
}
|
||||
|
||||
// Multiplication (complex multiplication)
|
||||
__host__ __device__ friend f128x2 operator*(const f128x2 &a,
|
||||
const f128x2 &b) {
|
||||
f128 real_part =
|
||||
f128::add(f128::mul(a.re, b.re),
|
||||
f128(-f128::mul(a.im, b.im).hi, -f128::mul(a.im, b.im).lo));
|
||||
f128 imag_part = f128::add(f128::mul(a.re, b.im), f128::mul(a.im, b.re));
|
||||
return f128x2(real_part, imag_part);
|
||||
}
|
||||
|
||||
// Addition-assignment operator
|
||||
__host__ __device__ f128x2 &operator+=(const f128x2 &other) {
|
||||
re = f128::add(re, other.re);
|
||||
im = f128::add(im, other.im);
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Subtraction-assignment operator
|
||||
__host__ __device__ f128x2 &operator-=(const f128x2 &other) {
|
||||
re = f128::add(re, f128(-other.re.hi, -other.re.lo));
|
||||
im = f128::add(im, f128(-other.im.hi, -other.im.lo));
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Multiplication-assignment operator
|
||||
__host__ __device__ f128x2 &operator*=(const f128x2 &other) {
|
||||
f128 new_re =
|
||||
f128::add(f128::mul(re, other.re), f128(-f128::mul(im, other.im).hi,
|
||||
-f128::mul(im, other.im).lo));
|
||||
f128 new_im = f128::add(f128::mul(re, other.im), f128::mul(im, other.re));
|
||||
re = new_re;
|
||||
im = new_im;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
__host__ __device__ inline uint64_t double_to_bits(double d) {
|
||||
uint64_t bits = *reinterpret_cast<uint64_t *>(&d);
|
||||
return bits;
|
||||
}
|
||||
|
||||
__host__ __device__ inline double bits_to_double(uint64_t bits) {
|
||||
double d = *reinterpret_cast<double *>(&bits);
|
||||
return d;
|
||||
}
|
||||
|
||||
__host__ __device__ double u128_to_f64(__uint128_t x) {
|
||||
const __uint128_t ONE = 1;
|
||||
const double A = ONE << 52;
|
||||
const double B = ONE << 104;
|
||||
const double C = ONE << 76;
|
||||
const double D = 340282366920938500000000000000000000000.;
|
||||
|
||||
const __uint128_t threshold = (ONE << 104);
|
||||
|
||||
if (x < threshold) {
|
||||
uint64_t A_bits = double_to_bits(A);
|
||||
|
||||
__uint128_t shifted = (x << 12);
|
||||
uint64_t lower64 = static_cast<uint64_t>(shifted);
|
||||
lower64 >>= 12;
|
||||
|
||||
uint64_t bits_l = A_bits | lower64;
|
||||
double l_temp = bits_to_double(bits_l);
|
||||
double l = l_temp - A;
|
||||
|
||||
uint64_t B_bits = double_to_bits(B);
|
||||
uint64_t top64 = static_cast<uint64_t>(x >> 52);
|
||||
uint64_t bits_h = B_bits | top64;
|
||||
double h_temp = bits_to_double(bits_h);
|
||||
double h = h_temp - B;
|
||||
|
||||
return (l + h);
|
||||
|
||||
} else {
|
||||
uint64_t C_bits = double_to_bits(C);
|
||||
|
||||
__uint128_t shifted = (x >> 12);
|
||||
uint64_t lower64 = static_cast<uint64_t>(shifted);
|
||||
lower64 >>= 12;
|
||||
|
||||
uint64_t x_lo = static_cast<uint64_t>(x);
|
||||
uint64_t mask_part = (x_lo & 0xFFFFFFULL);
|
||||
|
||||
uint64_t bits_l = C_bits | lower64 | mask_part;
|
||||
double l_temp = bits_to_double(bits_l);
|
||||
double l = l_temp - C;
|
||||
|
||||
uint64_t D_bits = double_to_bits(D);
|
||||
uint64_t top64 = static_cast<uint64_t>(x >> 76);
|
||||
uint64_t bits_h = D_bits | top64;
|
||||
double h_temp = bits_to_double(bits_h);
|
||||
double h = h_temp - D;
|
||||
|
||||
return (l + h);
|
||||
}
|
||||
}
|
||||
|
||||
__host__ __device__ __uint128_t f64_to_u128(const double f) {
|
||||
const __uint128_t ONE = 1;
|
||||
const uint64_t f_bits = double_to_bits(f);
|
||||
if (f_bits < 1023ull << 52) {
|
||||
return 0;
|
||||
} else {
|
||||
const __uint128_t m = ONE << 127 | (__uint128_t)f_bits << 75;
|
||||
const uint64_t s = 1150 - (f_bits >> 52);
|
||||
if (s >= 128) {
|
||||
return 0;
|
||||
} else {
|
||||
return m >> s;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__host__ __device__ double i128_to_f64(__int128_t const x) {
|
||||
uint64_t sign = static_cast<uint64_t>(x >> 64) & (1ULL << 63);
|
||||
__uint128_t abs =
|
||||
(x < 0) ? static_cast<__uint128_t>(-x) : static_cast<__uint128_t>(x);
|
||||
|
||||
return bits_to_double(double_to_bits(u128_to_f64(abs)) | sign);
|
||||
}
|
||||
__host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
|
||||
const double first_approx = i128_to_f64(x);
|
||||
const uint64_t sign_bit = double_to_bits(first_approx) & (1ull << 63);
|
||||
const __uint128_t first_approx_roundtrip =
|
||||
f64_to_u128((first_approx < 0) ? -first_approx : first_approx);
|
||||
const __uint128_t first_approx_roundtrip_signed =
|
||||
(sign_bit == (1ull << 63)) ? -first_approx_roundtrip
|
||||
: first_approx_roundtrip;
|
||||
|
||||
double correction = i128_to_f64(x - first_approx_roundtrip_signed);
|
||||
|
||||
return f128(first_approx, correction);
|
||||
};
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
// Convert __uint128_t to decimal string
|
||||
std::string to_string_128(__uint128_t value) {
|
||||
if (value == 0)
|
||||
return "0";
|
||||
|
||||
std::string result;
|
||||
// Repeatedly divide by 10 and build the number in reverse
|
||||
while (value > 0) {
|
||||
unsigned digit = static_cast<unsigned>(value % 10);
|
||||
result.push_back(static_cast<char>('0' + digit));
|
||||
value /= 10;
|
||||
}
|
||||
|
||||
// The digits are in reverse order, so reverse them
|
||||
std::reverse(result.begin(), result.end());
|
||||
return result;
|
||||
} // TIP To <b>Run</b> code, press <shortcut actionId="Run"/> or
|
||||
|
||||
#endif
|
||||
163
backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cu
Normal file
163
backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cu
Normal file
@@ -0,0 +1,163 @@
|
||||
#include "fft128.cuh"
|
||||
|
||||
void cuda_fourier_transform_forward_as_integer_f128_async(
|
||||
void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
|
||||
void *im1, void const *standard, const uint32_t N,
|
||||
const uint32_t number_of_samples) {
|
||||
switch (N) {
|
||||
case 64:
|
||||
host_fourier_transform_forward_as_integer_f128<Degree<64>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
|
||||
(double *)re1, (double *)im0, (double *)im1,
|
||||
(__uint128_t const *)standard, N, number_of_samples);
|
||||
break;
|
||||
case 128:
|
||||
host_fourier_transform_forward_as_integer_f128<Degree<128>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
|
||||
(double *)re1, (double *)im0, (double *)im1,
|
||||
(__uint128_t const *)standard, N, number_of_samples);
|
||||
break;
|
||||
case 256:
|
||||
host_fourier_transform_forward_as_integer_f128<Degree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
|
||||
(double *)re1, (double *)im0, (double *)im1,
|
||||
(__uint128_t const *)standard, N, number_of_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_fourier_transform_forward_as_integer_f128<Degree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
|
||||
(double *)re1, (double *)im0, (double *)im1,
|
||||
(__uint128_t const *)standard, N, number_of_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_fourier_transform_forward_as_integer_f128<Degree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
|
||||
(double *)re1, (double *)im0, (double *)im1,
|
||||
(__uint128_t const *)standard, N, number_of_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_fourier_transform_forward_as_integer_f128<Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
|
||||
(double *)re1, (double *)im0, (double *)im1,
|
||||
(__uint128_t const *)standard, N, number_of_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_fourier_transform_forward_as_integer_f128<Degree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
|
||||
(double *)re1, (double *)im0, (double *)im1,
|
||||
(__uint128_t const *)standard, N, number_of_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (f128 fft): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..4096].")
|
||||
}
|
||||
}
|
||||
|
||||
void cuda_fourier_transform_forward_as_torus_f128_async(
|
||||
void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
|
||||
void *im1, void const *standard, const uint32_t N,
|
||||
const uint32_t number_of_samples) {
|
||||
switch (N) {
|
||||
case 64:
|
||||
host_fourier_transform_forward_as_torus_f128<Degree<64>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
|
||||
(double *)re1, (double *)im0, (double *)im1,
|
||||
(__uint128_t const *)standard, N, number_of_samples);
|
||||
break;
|
||||
case 128:
|
||||
host_fourier_transform_forward_as_torus_f128<Degree<128>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
|
||||
(double *)re1, (double *)im0, (double *)im1,
|
||||
(__uint128_t const *)standard, N, number_of_samples);
|
||||
break;
|
||||
case 256:
|
||||
host_fourier_transform_forward_as_torus_f128<Degree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
|
||||
(double *)re1, (double *)im0, (double *)im1,
|
||||
(__uint128_t const *)standard, N, number_of_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_fourier_transform_forward_as_torus_f128<Degree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
|
||||
(double *)re1, (double *)im0, (double *)im1,
|
||||
(__uint128_t const *)standard, N, number_of_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_fourier_transform_forward_as_torus_f128<Degree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
|
||||
(double *)re1, (double *)im0, (double *)im1,
|
||||
(__uint128_t const *)standard, N, number_of_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_fourier_transform_forward_as_torus_f128<Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
|
||||
(double *)re1, (double *)im0, (double *)im1,
|
||||
(__uint128_t const *)standard, N, number_of_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_fourier_transform_forward_as_torus_f128<Degree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
|
||||
(double *)re1, (double *)im0, (double *)im1,
|
||||
(__uint128_t const *)standard, N, number_of_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (f128 fft): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..4096].")
|
||||
}
|
||||
}
|
||||
|
||||
void cuda_fourier_transform_backward_as_torus_f128_async(
|
||||
void *stream, uint32_t gpu_index, void *standard, void const *re0,
|
||||
void const *re1, void const *im0, void const *im1, const uint32_t N,
|
||||
const uint32_t number_of_samples) {
|
||||
switch (N) {
|
||||
case 64:
|
||||
host_fourier_transform_backward_as_torus_f128<Degree<64>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
|
||||
(double const *)re0, (double const *)re1, (double const *)im0,
|
||||
(double const *)im1, N, number_of_samples);
|
||||
break;
|
||||
case 128:
|
||||
host_fourier_transform_backward_as_torus_f128<Degree<128>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
|
||||
(double const *)re0, (double const *)re1, (double const *)im0,
|
||||
(double const *)im1, N, number_of_samples);
|
||||
break;
|
||||
case 256:
|
||||
host_fourier_transform_backward_as_torus_f128<Degree<256>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
|
||||
(double const *)re0, (double const *)re1, (double const *)im0,
|
||||
(double const *)im1, N, number_of_samples);
|
||||
break;
|
||||
case 512:
|
||||
host_fourier_transform_backward_as_torus_f128<Degree<512>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
|
||||
(double const *)re0, (double const *)re1, (double const *)im0,
|
||||
(double const *)im1, N, number_of_samples);
|
||||
break;
|
||||
case 1024:
|
||||
host_fourier_transform_backward_as_torus_f128<Degree<1024>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
|
||||
(double const *)re0, (double const *)re1, (double const *)im0,
|
||||
(double const *)im1, N, number_of_samples);
|
||||
break;
|
||||
case 2048:
|
||||
host_fourier_transform_backward_as_torus_f128<Degree<2048>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
|
||||
(double const *)re0, (double const *)re1, (double const *)im0,
|
||||
(double const *)im1, N, number_of_samples);
|
||||
break;
|
||||
case 4096:
|
||||
host_fourier_transform_backward_as_torus_f128<Degree<4096>>(
|
||||
static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
|
||||
(double const *)re0, (double const *)re1, (double const *)im0,
|
||||
(double const *)im1, N, number_of_samples);
|
||||
break;
|
||||
default:
|
||||
PANIC("Cuda error (f128 fft): unsupported polynomial size. Supported "
|
||||
"N's are powers of two"
|
||||
" in the interval [256..4096].")
|
||||
}
|
||||
}
|
||||
760
backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
Normal file
760
backends/tfhe-cuda-backend/cuda/src/fft128/fft128.cuh
Normal file
@@ -0,0 +1,760 @@
|
||||
#ifndef TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_FFT128_CUH_
|
||||
#define TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_FFT128_CUH_
|
||||
|
||||
#include "f128.cuh"
|
||||
#include "pbs/fft.h"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "twiddles.cuh"
|
||||
#include "types/complex/operations.cuh"
|
||||
#include <iostream>
|
||||
|
||||
using Index = unsigned;
|
||||
|
||||
#define NEG_TWID(i) \
|
||||
f128x2(f128(neg_twiddles_re_hi[(i)], neg_twiddles_re_lo[(i)]), \
|
||||
f128(neg_twiddles_im_hi[(i)], neg_twiddles_im_lo[(i)]))
|
||||
|
||||
#define F64x4_TO_F128x2(f128x2_reg, ind) \
|
||||
f128x2_reg.re.hi = dt_re_hi[ind]; \
|
||||
f128x2_reg.re.lo = dt_re_lo[ind]; \
|
||||
f128x2_reg.im.hi = dt_im_hi[ind]; \
|
||||
f128x2_reg.im.lo = dt_im_lo[ind]
|
||||
|
||||
#define F128x2_TO_F64x4(f128x2_reg, ind) \
|
||||
dt_re_hi[ind] = f128x2_reg.re.hi; \
|
||||
dt_re_lo[ind] = f128x2_reg.re.lo; \
|
||||
dt_im_hi[ind] = f128x2_reg.im.hi; \
|
||||
dt_im_lo[ind] = f128x2_reg.im.lo
|
||||
|
||||
// zl - left part of butterfly operation
|
||||
// zr - right part of butterfly operation
|
||||
// re - real part
|
||||
// im - imaginary part
|
||||
// hi - high bits
|
||||
// lo - low bits
|
||||
// dt - list
|
||||
// cf - single coefficient
|
||||
template <class params>
|
||||
__device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
|
||||
double *dt_im_hi,
|
||||
double *dt_im_lo) {
|
||||
|
||||
__syncthreads();
|
||||
constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
|
||||
constexpr Index LOG2_DEGREE = params::log2_degree;
|
||||
constexpr Index HALF_DEGREE = params::degree >> 1;
|
||||
constexpr Index STRIDE = params::degree / params::opt;
|
||||
|
||||
f128x2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;
|
||||
|
||||
Index tid = threadIdx.x;
|
||||
|
||||
// debug
|
||||
__syncthreads();
|
||||
if (threadIdx.x == 0 && blockIdx.x == 0) {
|
||||
printf("BUTTERFLY_DEPTH %d\n", BUTTERFLY_DEPTH);
|
||||
printf("LOG2_DEGREE %d\n", LOG2_DEGREE);
|
||||
printf("HALF_DEGREE %d\n", HALF_DEGREE);
|
||||
printf("STRIDE %d\n", STRIDE);
|
||||
printf("Params::degree %d\n", params::degree);
|
||||
printf("opt %d\n", params::opt);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// load into registers
|
||||
#pragma unroll
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
|
||||
F64x4_TO_F128x2(u[i], tid);
|
||||
F64x4_TO_F128x2(v[i], tid + HALF_DEGREE);
|
||||
// u[i].re.hi = dt_re_hi[tid];
|
||||
// u[i].re.lo = dt_re_lo[tid];
|
||||
// u[i].im.hi = dt_im_hi[tid];
|
||||
// u[i].im.lo = dt_im_lo[tid];
|
||||
|
||||
// v[i].re.hi = dt_re_hi[tid + HALF_DEGREE];
|
||||
// v[i].re.lo = dt_re_lo[tid + HALF_DEGREE];
|
||||
// v[i].im.hi = dt_im_hi[tid + HALF_DEGREE];
|
||||
// v[i].im.lo = dt_im_lo[tid + HALF_DEGREE];
|
||||
|
||||
// F64x4_TO_F128x2(u[i], tid);
|
||||
// F64x4_TO_F128x2(v[i], tid + HALF_DEGREE);
|
||||
tid += STRIDE;
|
||||
}
|
||||
|
||||
// level 1
|
||||
// we don't make actual complex multiplication on level1 since we have only
|
||||
// one twiddle, it's real and image parts are equal, so we can multiply
|
||||
// it with simpler operations
|
||||
|
||||
#pragma unroll
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
|
||||
auto ww = NEG_TWID(1);
|
||||
f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, NEG_TWID(1).re,
|
||||
NEG_TWID(1).im);
|
||||
// w = v[i] * NEG_TWID(1);
|
||||
|
||||
// __syncthreads();
|
||||
// if (threadIdx.x == 0 && blockIdx.x == 0) {
|
||||
// printf("w = %.5f %.5f %.5f %.5f\n", ww.re.hi, ww.re.lo, ww.im.hi,
|
||||
// ww.im.lo); printf("u = %.5f %.5f %.5f %.5f\n", u[i].re.hi,
|
||||
// u[i].re.lo, u[i].im.hi, u[i].im.lo); printf("v = %.5f %.5f %.5f
|
||||
// %.5f\n", v[i].re.hi, v[i].re.lo, v[i].im.hi, v[i].im.lo); printf("wv
|
||||
// = %.5f %.5f %.5f %.5f\n", w.re.hi, w.re.lo, w.im.hi, w.im.lo);
|
||||
// }
|
||||
// __syncthreads();
|
||||
// v[i] = u[i] - w;
|
||||
// u[i] = u[i] + w;
|
||||
|
||||
f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re, w.im);
|
||||
f128::cplx_f128_add_assign(u[i].re, u[i].im, u[i].re, u[i].im, w.re, w.im);
|
||||
}
|
||||
|
||||
// tid = threadIdx.x;
|
||||
// #pragma unroll
|
||||
// for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
|
||||
// F128x2_TO_F64x4(u[i], (tid));
|
||||
// F128x2_TO_F64x4(v[i], (tid + HALF_DEGREE));
|
||||
// tid = tid + STRIDE;
|
||||
// }
|
||||
// __syncthreads();
|
||||
|
||||
Index twiddle_shift = 1;
|
||||
int ii = 0;
|
||||
for (Index l = LOG2_DEGREE - 1; l >= 1; --l) {
|
||||
ii++;
|
||||
Index lane_mask = 1 << (l - 1);
|
||||
Index thread_mask = (1 << l) - 1;
|
||||
twiddle_shift <<= 1;
|
||||
|
||||
tid = threadIdx.x;
|
||||
__syncthreads();
|
||||
#pragma unroll
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
|
||||
Index rank = tid & thread_mask;
|
||||
bool u_stays_in_register = rank < lane_mask;
|
||||
F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
|
||||
tid = tid + STRIDE;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
|
||||
Index rank = tid & thread_mask;
|
||||
bool u_stays_in_register = rank < lane_mask;
|
||||
F64x4_TO_F128x2(w, tid ^ lane_mask);
|
||||
u[i] = (u_stays_in_register) ? u[i] : w;
|
||||
v[i] = (u_stays_in_register) ? w : v[i];
|
||||
w = NEG_TWID(tid / lane_mask + twiddle_shift);
|
||||
|
||||
// w *= v[i];
|
||||
f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, w.re, w.im);
|
||||
f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re,
|
||||
w.im);
|
||||
f128::cplx_f128_add_assign(u[i].re, u[i].im, u[i].re, u[i].im, w.re,
|
||||
w.im);
|
||||
tid = tid + STRIDE;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// store registers in SM
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
|
||||
F128x2_TO_F64x4(u[i], tid * 2);
|
||||
F128x2_TO_F64x4(v[i], (tid * 2 + 1));
|
||||
tid = tid + STRIDE;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
template <class params>
|
||||
__device__ void negacyclic_inverse_fft_f128(double *dt_re_hi, double *dt_re_lo,
|
||||
double *dt_im_hi,
|
||||
double *dt_im_lo) {
|
||||
__syncthreads();
|
||||
constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
|
||||
constexpr Index LOG2_DEGREE = params::log2_degree;
|
||||
constexpr Index DEGREE = params::degree;
|
||||
constexpr Index HALF_DEGREE = params::degree >> 1;
|
||||
constexpr Index STRIDE = params::degree / params::opt;
|
||||
|
||||
size_t tid = threadIdx.x;
|
||||
f128x2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;
|
||||
|
||||
// load into registers and divide by compressed polynomial size
|
||||
#pragma unroll
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
|
||||
|
||||
F64x4_TO_F128x2(u[i], 2 * tid);
|
||||
F64x4_TO_F128x2(v[i], 2 * tid + 1);
|
||||
|
||||
tid += STRIDE;
|
||||
}
|
||||
|
||||
Index twiddle_shift = DEGREE;
|
||||
for (Index l = 1; l <= LOG2_DEGREE - 1; ++l) {
|
||||
Index lane_mask = 1 << (l - 1);
|
||||
Index thread_mask = (1 << l) - 1;
|
||||
tid = threadIdx.x;
|
||||
twiddle_shift >>= 1;
|
||||
|
||||
// at this point registers are ready for the butterfly
|
||||
tid = threadIdx.x;
|
||||
__syncthreads();
|
||||
#pragma unroll
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
|
||||
w = (u[i] - v[i]);
|
||||
u[i] += v[i];
|
||||
v[i] = w * NEG_TWID(tid / lane_mask + twiddle_shift).conjugate();
|
||||
|
||||
// keep one of the register for next iteration and store another one in sm
|
||||
Index rank = tid & thread_mask;
|
||||
bool u_stays_in_register = rank < lane_mask;
|
||||
F128x2_TO_F64x4((u_stays_in_register) ? v[i] : u[i], tid);
|
||||
|
||||
tid = tid + STRIDE;
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// prepare registers for next butterfly iteration
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
|
||||
Index rank = tid & thread_mask;
|
||||
bool u_stays_in_register = rank < lane_mask;
|
||||
F64x4_TO_F128x2(w, tid ^ lane_mask);
|
||||
|
||||
u[i] = (u_stays_in_register) ? u[i] : w;
|
||||
v[i] = (u_stays_in_register) ? w : v[i];
|
||||
|
||||
tid = tid + STRIDE;
|
||||
}
|
||||
}
|
||||
|
||||
// last iteration
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
|
||||
w = (u[i] - v[i]);
|
||||
u[i] = u[i] + v[i];
|
||||
v[i] = w * NEG_TWID(1).conjugate();
|
||||
}
|
||||
__syncthreads();
|
||||
// store registers in SM
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
|
||||
F128x2_TO_F64x4(u[i], tid);
|
||||
F128x2_TO_F64x4(v[i], tid + HALF_DEGREE);
|
||||
|
||||
tid = tid + STRIDE;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
// params is expected to be full degree not half degree
|
||||
template <class params>
|
||||
__device__ void convert_u128_to_f128_as_integer(
|
||||
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
|
||||
const __uint128_t *in_re, const __uint128_t *in_im) {
|
||||
|
||||
Index tid = threadIdx.x;
|
||||
// #pragma unroll
|
||||
for (Index i = 0; i < params::opt / 2; i++) {
|
||||
__syncthreads();
|
||||
auto out_re = u128_to_signed_to_f128(in_re[tid]);
|
||||
__syncthreads();
|
||||
auto out_im = u128_to_signed_to_f128(in_im[tid]);
|
||||
__syncthreads();
|
||||
|
||||
out_re_hi[tid] = out_re.hi;
|
||||
out_re_lo[tid] = out_re.lo;
|
||||
out_im_hi[tid] = out_im.hi;
|
||||
out_im_lo[tid] = out_im.lo;
|
||||
|
||||
// __syncthreads();
|
||||
// if (threadIdx.x == 0 && blockIdx.x == 0) {
|
||||
// printf("%.5f %.5f %.5f %.5f\n", out_re_hi[tid], out_re_lo[tid],
|
||||
// out_im_hi[tid],
|
||||
// out_im_lo[tid]);
|
||||
// }
|
||||
// __syncthreads();
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
// params is expected to be full degree not half degree
|
||||
template <class params>
|
||||
__device__ void convert_u128_to_f128_as_torus(
|
||||
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
|
||||
const __uint128_t *in_re, const __uint128_t *in_im) {
|
||||
|
||||
const double normalization = pow(2., -128.);
|
||||
Index tid = threadIdx.x;
|
||||
// #pragma unroll
|
||||
for (Index i = 0; i < params::opt / 2; i++) {
|
||||
__syncthreads();
|
||||
auto out_re = u128_to_signed_to_f128(in_re[tid]);
|
||||
__syncthreads();
|
||||
auto out_im = u128_to_signed_to_f128(in_im[tid]);
|
||||
__syncthreads();
|
||||
|
||||
out_re_hi[tid] = out_re.hi * normalization;
|
||||
out_re_lo[tid] = out_re.lo * normalization;
|
||||
out_im_hi[tid] = out_im.hi * normalization;
|
||||
out_im_lo[tid] = out_im.lo * normalization;
|
||||
|
||||
// __syncthreads();
|
||||
// if (threadIdx.x == 0 && blockIdx.x == 0) {
|
||||
// printf("%.5f %.5f %.5f %.5f\n", out_re_hi[tid], out_re_lo[tid],
|
||||
// out_im_hi[tid],
|
||||
// out_im_lo[tid]);
|
||||
// }
|
||||
// __syncthreads();
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
// params is expected to be full degree not half degree
|
||||
template <class params>
|
||||
__global__ void
|
||||
batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
|
||||
double *out_im_hi, double *out_im_lo,
|
||||
const __uint128_t *in) {
|
||||
|
||||
convert_u128_to_f128_as_integer<params>(
|
||||
&out_re_hi[blockIdx.x * params::degree / 2],
|
||||
&out_re_lo[blockIdx.x * params::degree / 2],
|
||||
&out_im_hi[blockIdx.x * params::degree / 2],
|
||||
&out_im_lo[blockIdx.x * params::degree / 2],
|
||||
&in[blockIdx.x * params::degree],
|
||||
&in[blockIdx.x * params::degree + params::degree / 2]);
|
||||
}
|
||||
|
||||
// params is expected to be full degree not half degree
|
||||
template <class params>
|
||||
__global__ void
|
||||
batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
|
||||
double *out_im_hi, double *out_im_lo,
|
||||
const __uint128_t *in) {
|
||||
|
||||
convert_u128_to_f128_as_torus<params>(
|
||||
&out_re_hi[blockIdx.x * params::degree / 2],
|
||||
&out_re_lo[blockIdx.x * params::degree / 2],
|
||||
&out_im_hi[blockIdx.x * params::degree / 2],
|
||||
&out_im_lo[blockIdx.x * params::degree / 2],
|
||||
&in[blockIdx.x * params::degree],
|
||||
&in[blockIdx.x * params::degree + params::degree / 2]);
|
||||
}
|
||||
|
||||
template <class params, sharedMemDegree SMD>
|
||||
__global__ void
|
||||
batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
|
||||
double *in_im_lo, double *out_re_hi, double *out_re_lo,
|
||||
double *out_im_hi, double *out_im_lo, double *buffer) {
|
||||
extern __shared__ double sharedMemoryFFT[];
|
||||
double *re_hi, *re_lo, *im_hi, *im_lo;
|
||||
|
||||
// debug
|
||||
__syncthreads();
|
||||
if (threadIdx.x == 0 && blockIdx.x == 0) {
|
||||
printf("Params::degree %d\n", params::degree);
|
||||
printf("opt %d\n", params::opt);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
if (SMD == NOSM) {
|
||||
re_hi =
|
||||
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 0];
|
||||
re_lo =
|
||||
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 1];
|
||||
im_hi =
|
||||
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 2];
|
||||
im_lo =
|
||||
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
|
||||
} else {
|
||||
re_hi = &sharedMemoryFFT[params::degree / 2 * 0];
|
||||
re_lo = &sharedMemoryFFT[params::degree / 2 * 1];
|
||||
im_hi = &sharedMemoryFFT[params::degree / 2 * 2];
|
||||
im_lo = &sharedMemoryFFT[params::degree / 2 * 3];
|
||||
}
|
||||
|
||||
Index tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (Index i = 0; i < params::opt / 2; ++i) {
|
||||
re_hi[tid] = in_re_hi[blockIdx.x * (params::degree / 2) + tid];
|
||||
re_lo[tid] = in_re_lo[blockIdx.x * (params::degree / 2) + tid];
|
||||
im_hi[tid] = in_im_hi[blockIdx.x * (params::degree / 2) + tid];
|
||||
im_lo[tid] = in_im_lo[blockIdx.x * (params::degree / 2) + tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
__syncthreads();
|
||||
negacyclic_forward_fft_f128<HalfDegree<params>>(re_hi, re_lo, im_hi, im_lo);
|
||||
__syncthreads();
|
||||
|
||||
tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (Index i = 0; i < params::opt / 2; ++i) {
|
||||
out_re_hi[blockIdx.x * (params::degree / 2) + tid] = re_hi[tid];
|
||||
out_re_lo[blockIdx.x * (params::degree / 2) + tid] = re_lo[tid];
|
||||
out_im_hi[blockIdx.x * (params::degree / 2) + tid] = im_hi[tid];
|
||||
out_im_lo[blockIdx.x * (params::degree / 2) + tid] = im_lo[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
void print_uint128_bits(__uint128_t value) {
|
||||
char buffer[129]; // 128 bits + null terminator
|
||||
buffer[128] = '\0'; // Null-terminate the string
|
||||
|
||||
for (int i = 127; i >= 0; --i) {
|
||||
buffer[i] = (value & 1) ? '1' : '0'; // Extract the least significant bit
|
||||
value >>= 1; // Shift right by 1 bit
|
||||
}
|
||||
|
||||
printf("%s\n", buffer);
|
||||
}
|
||||
|
||||
template <class params>
|
||||
__host__ void host_fourier_transform_forward_as_integer_f128(
|
||||
cudaStream_t stream, uint32_t gpu_index, double *re0, double *re1,
|
||||
double *im0, double *im1, const __uint128_t *standard, const uint32_t N,
|
||||
const uint32_t number_of_samples) {
|
||||
|
||||
// for (int i = 0; i < N / 2; i++)
|
||||
// {
|
||||
// printf("%.10f\n", re0[i]);
|
||||
// }
|
||||
// printf("cpp_poly_host\n");
|
||||
// for (int i = 0; i < N; i++) {
|
||||
// print_uint128_bits(standard[i]);
|
||||
// }
|
||||
// printf("check #1\n");
|
||||
|
||||
// for (int i = 0; i < 32; i++) {
|
||||
// standard[i + 32] = standard[i];
|
||||
// }
|
||||
|
||||
// allocate device buffers
|
||||
double *d_re0 =
|
||||
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
|
||||
double *d_re1 =
|
||||
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
|
||||
double *d_im0 =
|
||||
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
|
||||
double *d_im1 =
|
||||
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
|
||||
__uint128_t *d_standard = (__uint128_t *)cuda_malloc_async(
|
||||
N * sizeof(__uint128_t), stream, gpu_index);
|
||||
|
||||
// copy input into device
|
||||
cuda_memcpy_async_to_gpu(d_standard, standard, N * sizeof(__uint128_t),
|
||||
stream, gpu_index);
|
||||
|
||||
// setup launch parameters
|
||||
size_t required_shared_memory_size = sizeof(double) * N / 2 * 4;
|
||||
int grid_size = number_of_samples;
|
||||
int block_size = params::degree / params::opt;
|
||||
bool full_sm =
|
||||
(required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
|
||||
size_t buffer_size = full_sm ? 0 : (size_t)number_of_samples * N / 2 * 4;
|
||||
size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
|
||||
double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
|
||||
|
||||
// configure shared memory for batch fft kernel
|
||||
if (full_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
}
|
||||
|
||||
// convert u128 into 4 x double
|
||||
batch_convert_u128_to_f128_as_integer<params>
|
||||
<<<grid_size, block_size, 0, stream>>>(d_re0, d_re1, d_im0, d_im1,
|
||||
d_standard);
|
||||
|
||||
// call negacyclic 128 bit forward fft.
|
||||
if (full_sm) {
|
||||
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>
|
||||
<<<grid_size, block_size, shared_memory_size, stream>>>(
|
||||
d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
|
||||
} else {
|
||||
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, NOSM>
|
||||
<<<grid_size, block_size, shared_memory_size, stream>>>(
|
||||
d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
|
||||
}
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// print_debug("re_hi", d_re0, 32);
|
||||
// print_debug("d_re_lo", d_re1, 32);
|
||||
// print_debug("d_im_hi", d_im0, 32);
|
||||
// print_debug("d_im_lo", d_im1, 32);
|
||||
|
||||
cuda_memcpy_async_to_cpu(re0, d_re0, N / 2 * sizeof(double), stream,
|
||||
gpu_index);
|
||||
cuda_memcpy_async_to_cpu(re1, d_re1, N / 2 * sizeof(double), stream,
|
||||
gpu_index);
|
||||
cuda_memcpy_async_to_cpu(im0, d_im0, N / 2 * sizeof(double), stream,
|
||||
gpu_index);
|
||||
cuda_memcpy_async_to_cpu(im1, d_im1, N / 2 * sizeof(double), stream,
|
||||
gpu_index);
|
||||
|
||||
cuda_drop_async(d_standard, stream, gpu_index);
|
||||
cuda_drop_async(d_re0, stream, gpu_index);
|
||||
cuda_drop_async(d_re1, stream, gpu_index);
|
||||
cuda_drop_async(d_im0, stream, gpu_index);
|
||||
cuda_drop_async(d_im1, stream, gpu_index);
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// printf("params::degree: %d\n", params::degree);
|
||||
// printf("params::opt: %d\n", params::opt);
|
||||
// printf("N: %d\n", N);
|
||||
// for (int i = 0; i < N; i++)
|
||||
// {
|
||||
// printf("%s\n", to_string_128(standard[i]).c_str());
|
||||
// }
|
||||
//
|
||||
// for (int i = 0; i < N / 2; i++) {
|
||||
//// auto re = u128_to_signed_to_f128(standard[i]);
|
||||
//// auto im = u128_to_signed_to_f128(standard[i + N / 2]);
|
||||
//// printf("%.10f %.10f %.10f %.10f\n", re.hi, re.lo, im.hi, im.lo);
|
||||
// printf("%.10f %.10f %.10f %.10f\n", re0[i], re1[i], im0[i], im1[i]);
|
||||
// }
|
||||
}
|
||||
|
||||
__global__ void print_twiddles(int N) {
|
||||
for (int i = 0; i < N / 2; i++) {
|
||||
printf("%.73f %.73f %.73f %.73f\n", neg_twiddles_re_hi[i],
|
||||
neg_twiddles_re_lo[i], neg_twiddles_im_hi[i], neg_twiddles_im_lo[i]);
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void print_c128(double *re0, double *re1, double *im0, double *im1,
|
||||
int N) {
|
||||
for (int i = 0; i < N / 2; i++) {
|
||||
printf("%.73f %.73f %.73f %.73f\n", re0[i], re1[i], im0[i], im1[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template <class params>
|
||||
__host__ void host_fourier_transform_forward_as_torus_f128(
|
||||
cudaStream_t stream, uint32_t gpu_index, double *re0, double *re1,
|
||||
double *im0, double *im1, const __uint128_t *standard, const uint32_t N,
|
||||
const uint32_t number_of_samples) {
|
||||
|
||||
print_twiddles<<<1, 1>>>(N);
|
||||
cudaDeviceSynchronize();
|
||||
// for (int i = 0; i < N / 2; i++)
|
||||
// {
|
||||
// printf("%.10f\n", re0[i]);
|
||||
// }
|
||||
// printf("cpp_poly_host\n");
|
||||
// for (int i = 0; i < N; i++) {
|
||||
// print_uint128_bits(standard[i]);
|
||||
// }
|
||||
// printf("check #1\n");
|
||||
|
||||
// for (int i = 0; i < 32; i++) {
|
||||
// standard[i + 32] = standard[i];
|
||||
// }
|
||||
|
||||
// allocate device buffers
|
||||
double *d_re0 =
|
||||
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
|
||||
double *d_re1 =
|
||||
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
|
||||
double *d_im0 =
|
||||
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
|
||||
double *d_im1 =
|
||||
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
|
||||
__uint128_t *d_standard = (__uint128_t *)cuda_malloc_async(
|
||||
N * sizeof(__uint128_t), stream, gpu_index);
|
||||
|
||||
// copy input into device
|
||||
cuda_memcpy_async_to_gpu(d_standard, standard, N * sizeof(__uint128_t),
|
||||
stream, gpu_index);
|
||||
|
||||
// setup launch parameters
|
||||
size_t required_shared_memory_size = sizeof(double) * N / 2 * 4;
|
||||
int grid_size = number_of_samples;
|
||||
int block_size = params::degree / params::opt;
|
||||
bool full_sm =
|
||||
(required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
|
||||
size_t buffer_size = full_sm ? 0 : (size_t)number_of_samples * N / 2 * 4;
|
||||
size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
|
||||
double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
|
||||
|
||||
// configure shared memory for batch fft kernel
|
||||
if (full_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
}
|
||||
|
||||
// convert u128 into 4 x double
|
||||
batch_convert_u128_to_f128_as_torus<params>
|
||||
<<<grid_size, block_size, 0, stream>>>(d_re0, d_re1, d_im0, d_im1,
|
||||
d_standard);
|
||||
print_c128<<<1, 1>>>(d_re0, d_re1, d_im0, d_im1, N);
|
||||
cudaDeviceSynchronize();
|
||||
// call negacyclic 128 bit forward fft.
|
||||
if (full_sm) {
|
||||
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>
|
||||
<<<grid_size, block_size, shared_memory_size, stream>>>(
|
||||
d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
|
||||
} else {
|
||||
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, NOSM>
|
||||
<<<grid_size, block_size, shared_memory_size, stream>>>(
|
||||
d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
|
||||
}
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// print_debug("re_hi", d_re0, 32);
|
||||
// print_debug("d_re_lo", d_re1, 32);
|
||||
// print_debug("d_im_hi", d_im0, 32);
|
||||
// print_debug("d_im_lo", d_im1, 32);
|
||||
|
||||
cuda_memcpy_async_to_cpu(re0, d_re0, N / 2 * sizeof(double), stream,
|
||||
gpu_index);
|
||||
cuda_memcpy_async_to_cpu(re1, d_re1, N / 2 * sizeof(double), stream,
|
||||
gpu_index);
|
||||
cuda_memcpy_async_to_cpu(im0, d_im0, N / 2 * sizeof(double), stream,
|
||||
gpu_index);
|
||||
cuda_memcpy_async_to_cpu(im1, d_im1, N / 2 * sizeof(double), stream,
|
||||
gpu_index);
|
||||
|
||||
cuda_drop_async(d_standard, stream, gpu_index);
|
||||
cuda_drop_async(d_re0, stream, gpu_index);
|
||||
cuda_drop_async(d_re1, stream, gpu_index);
|
||||
cuda_drop_async(d_im0, stream, gpu_index);
|
||||
cuda_drop_async(d_im1, stream, gpu_index);
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
// printf("params::degree: %d\n", params::degree);
|
||||
// printf("params::opt: %d\n", params::opt);
|
||||
// printf("N: %d\n", N);
|
||||
// for (int i = 0; i < N; i++)
|
||||
// {
|
||||
// printf("%s\n", to_string_128(standard[i]).c_str());
|
||||
// }
|
||||
|
||||
// for (int i = 0; i < N / 2; i++) {
|
||||
//// auto re = u128_to_signed_to_f128(standard[i]);
|
||||
//// auto im = u128_to_signed_to_f128(standard[i + N / 2]);
|
||||
//// printf("%.10f %.10f %.10f %.10f\n", re.hi, re.lo, im.hi, im.lo);
|
||||
// printf("%.10f %.10f %.10f %.10f\n", re0[i], re1[i], im0[i], im1[i]);
|
||||
// }
|
||||
}
|
||||
|
||||
template <class params>
|
||||
__host__ void host_fourier_transform_backward_as_torus_f128(
|
||||
cudaStream_t stream, uint32_t gpu_index, __uint128_t *standard,
|
||||
double const *re0, double const *re1, double const *im0, double const *im1,
|
||||
const uint32_t N, const uint32_t number_of_samples) {
|
||||
|
||||
// allocate device buffers
|
||||
double *d_re0 =
|
||||
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
|
||||
double *d_re1 =
|
||||
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
|
||||
double *d_im0 =
|
||||
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
|
||||
double *d_im1 =
|
||||
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
|
||||
__uint128_t *d_standard = (__uint128_t *)cuda_malloc_async(
|
||||
N * sizeof(__uint128_t), stream, gpu_index);
|
||||
|
||||
// // copy input into device
|
||||
cuda_memcpy_async_to_gpu(d_re0, standard, N / 2 * sizeof(double), stream,
|
||||
gpu_index);
|
||||
cuda_memcpy_async_to_gpu(d_re1, standard, N / 2 * sizeof(double), stream,
|
||||
gpu_index);
|
||||
cuda_memcpy_async_to_gpu(d_im0, standard, N / 2 * sizeof(double), stream,
|
||||
gpu_index);
|
||||
cuda_memcpy_async_to_gpu(d_im1, standard, N / 2 * sizeof(double), stream,
|
||||
gpu_index);
|
||||
|
||||
// setup launch parameters
|
||||
size_t required_shared_memory_size = sizeof(double) * N / 2 * 4;
|
||||
int grid_size = number_of_samples;
|
||||
int block_size = params::degree / params::opt;
|
||||
bool full_sm =
|
||||
(required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
|
||||
size_t buffer_size = full_sm ? 0 : (size_t)number_of_samples * N / 2 * 4;
|
||||
size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
|
||||
double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
|
||||
|
||||
// configure shared memory for batch fft kernel
|
||||
if (full_sm) {
|
||||
check_cuda_error(cudaFuncSetAttribute(
|
||||
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
|
||||
check_cuda_error(cudaFuncSetCacheConfig(
|
||||
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
}
|
||||
|
||||
// // convert u128 into 4 x double
|
||||
// batch_convert_u128_to_f128_as_torus<params><<<grid_size, block_size, 0,
|
||||
// stream>>>(
|
||||
// d_re0, d_re1, d_im0, d_im1, d_standard);
|
||||
|
||||
// call negacyclic 128 bit forward fft.
|
||||
// if (full_sm) {
|
||||
// negacyclic_inverse_fft_f128<FFTDegree<params, ForwardFFT>,
|
||||
// FULLSM><<<grid_size, block_size, shared_memory_size, stream>>>
|
||||
// (d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
|
||||
// } else {
|
||||
// batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, NOSM><<<grid_size,
|
||||
// block_size, shared_memory_size, stream>>>
|
||||
// (d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
|
||||
//
|
||||
// }
|
||||
//
|
||||
// cudaDeviceSynchronize();
|
||||
|
||||
//// print_debug("re_hi", d_re0, 32);
|
||||
//// print_debug("d_re_lo", d_re1, 32);
|
||||
//// print_debug("d_im_hi", d_im0, 32);
|
||||
//// print_debug("d_im_lo", d_im1, 32);
|
||||
//
|
||||
|
||||
cuda_memcpy_async_to_cpu(standard, d_standard, N * sizeof(__uint128_t),
|
||||
stream, gpu_index);
|
||||
cuda_drop_async(d_standard, stream, gpu_index);
|
||||
cuda_drop_async(d_re0, stream, gpu_index);
|
||||
cuda_drop_async(d_re1, stream, gpu_index);
|
||||
cuda_drop_async(d_im0, stream, gpu_index);
|
||||
cuda_drop_async(d_im1, stream, gpu_index);
|
||||
|
||||
cudaDeviceSynchronize();
|
||||
//
|
||||
// printf("params::degree: %d\n", params::degree);
|
||||
// printf("params::opt: %d\n", params::opt);
|
||||
// printf("N: %d\n", N);
|
||||
// for (int i = 0; i < N; i++)
|
||||
// {
|
||||
// printf("%s\n", to_string_128(standard[i]).c_str());
|
||||
// }
|
||||
//
|
||||
// for (int i = 0; i < N / 2; i++) {
|
||||
//// auto re = u128_to_signed_to_f128(standard[i]);
|
||||
//// auto im = u128_to_signed_to_f128(standard[i + N / 2]);
|
||||
//// printf("%.10f %.10f %.10f %.10f\n", re.hi, re.lo, im.hi, im.lo);
|
||||
// printf("%.10f %.10f %.10f %.10f\n", re0[i], re1[i], im0[i], im1[i]);
|
||||
// }
|
||||
//
|
||||
}
|
||||
|
||||
#endif // TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_FFT128_CUH_
|
||||
16387
backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu
Normal file
16387
backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cu
Normal file
File diff suppressed because it is too large
Load Diff
11
backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cuh
Normal file
11
backends/tfhe-cuda-backend/cuda/src/fft128/twiddles.cuh
Normal file
@@ -0,0 +1,11 @@
|
||||
#ifndef GPU_BOOTSTRAP_128_TWIDDLES_CUH
|
||||
#define GPU_BOOTSTRAP_128_TWIDDLES_CUH
|
||||
|
||||
/*
|
||||
* 'negtwiddles' are stored in device memory to profit caching
|
||||
*/
|
||||
extern __device__ double neg_twiddles_re_hi[4096];
|
||||
extern __device__ double neg_twiddles_re_lo[4096];
|
||||
extern __device__ double neg_twiddles_im_hi[4096];
|
||||
extern __device__ double neg_twiddles_im_lo[4096];
|
||||
#endif
|
||||
@@ -1238,6 +1238,45 @@ extern "C" {
|
||||
input_lwe_ciphertext_count: u32,
|
||||
);
|
||||
}
|
||||
extern "C" {
|
||||
pub fn cuda_fourier_transform_forward_as_torus_f128_async(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
re0: *mut ffi::c_void,
|
||||
re1: *mut ffi::c_void,
|
||||
im0: *mut ffi::c_void,
|
||||
im1: *mut ffi::c_void,
|
||||
standard: *const ffi::c_void,
|
||||
N: u32,
|
||||
number_of_samples: u32,
|
||||
);
|
||||
}
|
||||
extern "C" {
|
||||
pub fn cuda_fourier_transform_forward_as_integer_f128_async(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
re0: *mut ffi::c_void,
|
||||
re1: *mut ffi::c_void,
|
||||
im0: *mut ffi::c_void,
|
||||
im1: *mut ffi::c_void,
|
||||
standard: *const ffi::c_void,
|
||||
N: u32,
|
||||
number_of_samples: u32,
|
||||
);
|
||||
}
|
||||
extern "C" {
|
||||
pub fn cuda_fourier_transform_backward_as_torus_f128_async(
|
||||
stream: *mut ffi::c_void,
|
||||
gpu_index: u32,
|
||||
standard: *mut ffi::c_void,
|
||||
re0: *const ffi::c_void,
|
||||
re1: *const ffi::c_void,
|
||||
im0: *const ffi::c_void,
|
||||
im1: *const ffi::c_void,
|
||||
N: u32,
|
||||
number_of_samples: u32,
|
||||
);
|
||||
}
|
||||
extern "C" {
|
||||
pub fn cuda_fourier_polynomial_mul(
|
||||
stream: *mut ffi::c_void,
|
||||
|
||||
@@ -4,5 +4,6 @@
|
||||
#include "cuda/include/integer/integer.h"
|
||||
#include "cuda/include/keyswitch.h"
|
||||
#include "cuda/include/linear_algebra.h"
|
||||
#include "cuda/include/pbs/fft.h"
|
||||
#include "cuda/include/pbs/programmable_bootstrap.h"
|
||||
#include "cuda/include/pbs/programmable_bootstrap_multibit.h"
|
||||
|
||||
5
bench_log.txt
Normal file
5
bench_log.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
Finished `bench` profile [optimized] target(s) in 0.10s
|
||||
Running benches/integer/bench.rs (target/release/deps/integer_bench-f3741775ea815609)
|
||||
Benchmarking integer::cuda::div_rem/integer::cuda::div_rem::PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS:...
|
||||
Benchmarking integer::cuda::div_rem/integer::cuda::div_rem::PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS:...: Warming up for 3.0000 s
|
||||
Benchmarking integer::cuda::div_rem/integer::cuda::div_rem::PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS:...: Collecting 15 samples in estimated 100.21 s (240 iterations)
|
||||
653
branches.log
Normal file
653
branches.log
Normal file
@@ -0,0 +1,653 @@
|
||||
al/fix_scalar_shifts
|
||||
al/signed_scalar_div
|
||||
bb/signed_div
|
||||
bench/gpu/div_rem
|
||||
chore/gpu/sum_ciphertexts_comments
|
||||
chre/gpu/integer_mult_gpu_params
|
||||
cuda/sprint_demo_bench
|
||||
feat/cuda/abs
|
||||
feat/cuda/accumulate_tbc_with_registers
|
||||
feat/cuda/new_type_cast
|
||||
feat/cuda/new_type_cast_no_round
|
||||
feat/cuda/signed_div
|
||||
feat/cuda_fft/reduce_sm_io
|
||||
feat/cuda_fft/shuffle_csel
|
||||
feat/gpu/apply_lut_with_factor
|
||||
feat/gpu/cuda_vector_add
|
||||
feat/gpu/cufftdx
|
||||
feat/gpu/div
|
||||
feat/gpu/div_backup
|
||||
feat/gpu/div_backup2
|
||||
feat/gpu/div_backup2_cleanup
|
||||
feat/gpu/div_rem_update_info
|
||||
feat/gpu/fft128
|
||||
feat/gpu/fft_4_step
|
||||
feat/gpu/generate_last_block_inner_propagation
|
||||
feat/gpu/host_rotate_blocks
|
||||
feat/gpu/mul_boolean_optimization
|
||||
feat/gpu/overflow_add
|
||||
feat/gpu/overflowing_scalar_add_and_sub
|
||||
feat/gpu/overflowing_sub
|
||||
feat/gpu/propagate_single_carry_get_input_carries
|
||||
feat/gpu/reuse_pbs_memory
|
||||
feat/gpu/shifts_one_wave
|
||||
feat/gpu/signed_overflowing_add_or_sub
|
||||
feat/gpu/signed_overflowing_add_or_sub_with_cuda
|
||||
feat/gpu/signed_overflowing_sub
|
||||
feat/gpu/signed_overflowing_sub_with_cuda
|
||||
feat/gpu/signed_scalar_add
|
||||
feat/gpu/unsigned_overflowing_scalar_add
|
||||
fix/gpu/div_mod/refactor_memory
|
||||
fix/gpu/fix_terms_degree_bug
|
||||
* fix/gpu/full_carry_prop_noise_level
|
||||
fix/gpu/single_carry_propagation
|
||||
main
|
||||
new_fft/fft_wide
|
||||
sk/feat/cuda-fft
|
||||
tm/full-prop-clean
|
||||
remotes/origin/0.5.vector
|
||||
remotes/origin/HEAD -> origin/main
|
||||
remotes/origin/add-c-api-gpu
|
||||
remotes/origin/add-license-faq
|
||||
remotes/origin/add_glwe_keyswitch
|
||||
remotes/origin/al/add_mem_check
|
||||
remotes/origin/al/add_vm_targets_erc20
|
||||
remotes/origin/al/avoid_sync_broadcast_lut
|
||||
remotes/origin/al/backup_multi_gpu
|
||||
remotes/origin/al/balanced_decomposition
|
||||
remotes/origin/al/bench_multi_gpu_throughput
|
||||
remotes/origin/al/chore/measure_pbs_timings
|
||||
remotes/origin/al/chore/measure_scalar_shift_timings
|
||||
remotes/origin/al/ci_fixes
|
||||
remotes/origin/al/clean_unused_functions
|
||||
remotes/origin/al/cuda_backend_build
|
||||
remotes/origin/al/cuda_malloc
|
||||
remotes/origin/al/cufftdx
|
||||
remotes/origin/al/debug_gpu
|
||||
remotes/origin/al/debug_launch_failure
|
||||
remotes/origin/al/default_config_gpu
|
||||
remotes/origin/al/div_multi_gpu
|
||||
remotes/origin/al/div_perf_multi_gpu
|
||||
remotes/origin/al/erc20_workflows
|
||||
remotes/origin/al/fix_clippy_error
|
||||
remotes/origin/al/fix_compression
|
||||
remotes/origin/al/fix_full_prop
|
||||
remotes/origin/al/fix_gpu_index
|
||||
remotes/origin/al/fix_mul_mem
|
||||
remotes/origin/al/fix_scalar_shifts
|
||||
remotes/origin/al/fix_shift_rotate_buffer
|
||||
remotes/origin/al/fix_sm_size
|
||||
remotes/origin/al/fix_tests
|
||||
remotes/origin/al/fixes
|
||||
remotes/origin/al/gpu_array_hl_api
|
||||
remotes/origin/al/gpu_same_params_as_cpu
|
||||
remotes/origin/al/improve_cmux
|
||||
remotes/origin/al/improve_scalar_div
|
||||
remotes/origin/al/investigate_launch_failure_bug
|
||||
remotes/origin/al/investigate_rtx_bugs
|
||||
remotes/origin/al/investigate_scalar_add_sub_failures
|
||||
remotes/origin/al/l40_bench
|
||||
remotes/origin/al/long_run_tests
|
||||
remotes/origin/al/many_lut_test
|
||||
remotes/origin/al/multi_gpu_debug
|
||||
remotes/origin/al/multi_gpu_dispatch_lwe
|
||||
remotes/origin/al/multi_gpu_dispatch_lwe_alt
|
||||
remotes/origin/al/multi_gpu_doc
|
||||
remotes/origin/al/multi_gpu_fix_div
|
||||
remotes/origin/al/multi_gpu_refactor_rebased
|
||||
remotes/origin/al/natural_decompoisition_order
|
||||
remotes/origin/al/noise_test
|
||||
remotes/origin/al/nvidia-smi-in-workflows
|
||||
remotes/origin/al/nvidia-smi-in-workflows-alt
|
||||
remotes/origin/al/optimize_mul_mem
|
||||
remotes/origin/al/p2p_mutex
|
||||
remotes/origin/al/p5_bench_workflow
|
||||
remotes/origin/al/pfail_gpu
|
||||
remotes/origin/al/pin_ksk_host
|
||||
remotes/origin/al/pinned_host_mem_bsk
|
||||
remotes/origin/al/pinned_host_memory
|
||||
remotes/origin/al/reduce_memory_pressure_scalar_mul
|
||||
remotes/origin/al/refactor_lwe_chunk_size
|
||||
remotes/origin/al/remove_host_decoration
|
||||
remotes/origin/al/remove_mempool_acess
|
||||
remotes/origin/al/remove_ms_comp
|
||||
remotes/origin/al/remove_omp_div
|
||||
remotes/origin/al/reset_test_threads
|
||||
remotes/origin/al/restrict_number_of_gpus
|
||||
remotes/origin/al/reuse_mem_cmux
|
||||
remotes/origin/al/rust_binding
|
||||
remotes/origin/al/set_higher_max_shared_memory
|
||||
remotes/origin/al/signed_long_run_tests
|
||||
remotes/origin/al/signed_scalar_div
|
||||
remotes/origin/al/simplify_4090_bench
|
||||
remotes/origin/al/small_fixes
|
||||
remotes/origin/al/sum_ct_single_gpu
|
||||
remotes/origin/al/sum_ct_vec_multi_gpu
|
||||
remotes/origin/al/test_device_other_than_zero
|
||||
remotes/origin/al/troubleshoot_h100_tests
|
||||
remotes/origin/al/tuniform_params
|
||||
remotes/origin/al/use_byte_add
|
||||
remotes/origin/am/chore/01-to-02-keys
|
||||
remotes/origin/am/chore/05-to-06
|
||||
remotes/origin/am/chore/add-assert-for-lwe-dim-pbs
|
||||
remotes/origin/am/chore/add-noise-and-security-curves-crates
|
||||
remotes/origin/am/chore/bench-size
|
||||
remotes/origin/am/chore/bump-version
|
||||
remotes/origin/am/chore/bump-zk-version
|
||||
remotes/origin/am/chore/change-instance-gpu-build
|
||||
remotes/origin/am/chore/ci-auto-data-branch
|
||||
remotes/origin/am/chore/clippy-bug
|
||||
remotes/origin/am/chore/comp
|
||||
remotes/origin/am/chore/compress-1-bits
|
||||
remotes/origin/am/chore/container-metadata
|
||||
remotes/origin/am/chore/dep-updates
|
||||
remotes/origin/am/chore/do-not-run-fft-workflows-on-push
|
||||
remotes/origin/am/chore/doc-casting
|
||||
remotes/origin/am/chore/doc-link-fix
|
||||
remotes/origin/am/chore/docs-0-8
|
||||
remotes/origin/am/chore/expose-rng
|
||||
remotes/origin/am/chore/expose-rng-0.7
|
||||
remotes/origin/am/chore/expose-rng-0.8
|
||||
remotes/origin/am/chore/fix-fft-bench-parser
|
||||
remotes/origin/am/chore/fix-incorrec-comment
|
||||
remotes/origin/am/chore/fix-wasm-timeout
|
||||
remotes/origin/am/chore/investigate-wasm-again
|
||||
remotes/origin/am/chore/make-ks-level-order-consistent
|
||||
remotes/origin/am/chore/multi-bit-4-4
|
||||
remotes/origin/am/chore/multi-bit-test
|
||||
remotes/origin/am/chore/no-test-internal
|
||||
remotes/origin/am/chore/pk-ks-params-big-key
|
||||
remotes/origin/am/chore/python-web-driver
|
||||
remotes/origin/am/chore/remove-open-handle-detection
|
||||
remotes/origin/am/chore/remove-remaining-modular-std-dev
|
||||
remotes/origin/am/chore/remove-serde-derive-glwe-body
|
||||
remotes/origin/am/chore/remove-some-concrete-branding
|
||||
remotes/origin/am/chore/sampling-tool
|
||||
remotes/origin/am/chore/sampling-tool-karatsuba
|
||||
remotes/origin/am/chore/saved-ntt
|
||||
remotes/origin/am/chore/slack-notif-status
|
||||
remotes/origin/am/chore/tfhe-fft
|
||||
remotes/origin/am/chore/tfhe-ntt
|
||||
remotes/origin/am/chore/trivium-test
|
||||
remotes/origin/am/chore/update-nightly
|
||||
remotes/origin/am/chore/update-wasm-bench
|
||||
remotes/origin/am/chore/update-wop-parameters
|
||||
remotes/origin/am/chore/utils-shortint
|
||||
remotes/origin/am/chore/wop-bug
|
||||
remotes/origin/am/chore/zk-v2-manage-upper-bound-params
|
||||
remotes/origin/am/dev/forward-comp-as-0-5
|
||||
remotes/origin/am/doc/forward-compatibility
|
||||
remotes/origin/am/doc/migrate-data-to-0-5
|
||||
remotes/origin/am/docs-revamp
|
||||
remotes/origin/am/doctest_bug
|
||||
remotes/origin/am/doctest_bug_minify
|
||||
remotes/origin/am/exp/g4
|
||||
remotes/origin/am/feat/TUniform
|
||||
remotes/origin/am/feat/any-q-decomposition
|
||||
remotes/origin/am/feat/cpk-cast
|
||||
remotes/origin/am/feat/csk-decompress
|
||||
remotes/origin/am/feat/debug-pbs-count
|
||||
remotes/origin/am/feat/deparallelized-multi-bit
|
||||
remotes/origin/am/feat/forward-comp-as-0-5
|
||||
remotes/origin/am/feat/keyswitch-flexible-modulus
|
||||
remotes/origin/am/feat/keytricks-part1
|
||||
remotes/origin/am/feat/keytricks-part2
|
||||
remotes/origin/am/feat/keytricks-part3
|
||||
remotes/origin/am/feat/ks-with-scalar-change
|
||||
remotes/origin/am/feat/more-efficient-integer-list-expansion
|
||||
remotes/origin/am/feat/ntt-pbs
|
||||
remotes/origin/am/feat/par-ksk-gen
|
||||
remotes/origin/am/feat/prime-q-lwe-glwe-ggsw
|
||||
remotes/origin/am/feat/prime-q-support-part4
|
||||
remotes/origin/am/feat/prime-q-support-part5
|
||||
remotes/origin/am/feat/q
|
||||
remotes/origin/am/feat/raw-parts-for-ksk
|
||||
remotes/origin/am/feat/semver-trick
|
||||
remotes/origin/am/feat/updated-ntt-pbs
|
||||
remotes/origin/am/feat/v2-padding-bit
|
||||
remotes/origin/am/feat/zk-metadata
|
||||
remotes/origin/am/feat/zk-proofs
|
||||
remotes/origin/am/feat/zk-v1-padding
|
||||
remotes/origin/am/fix/balanced-decomposition-gpu
|
||||
remotes/origin/am/fix/pedro-mbb-wasm
|
||||
remotes/origin/am/fix/shortint-ksk-noise-level
|
||||
remotes/origin/am/fix/trivial-0-wop
|
||||
remotes/origin/am/fix/use-proper-stair-ks-dim
|
||||
remotes/origin/am/fix/wop-crt-lut-gen
|
||||
remotes/origin/am/hack/shortint-wop-return-keys
|
||||
remotes/origin/am/hack/tuniform
|
||||
remotes/origin/am/noise_test
|
||||
remotes/origin/am/refactor-ggsw-encryption-inputs-consistent
|
||||
remotes/origin/am/refactor/better-compressed-pk
|
||||
remotes/origin/am/refactor/bsk-pfpksk-match-decomposer-order
|
||||
remotes/origin/am/refactor/core-c-api
|
||||
remotes/origin/am/refactor/factorize-expansion-code
|
||||
remotes/origin/am/refactor/ggsw-encryption-inputs-consistent
|
||||
remotes/origin/am/refactor/non-native-decomposer
|
||||
remotes/origin/am/refactor/rng-fallibility
|
||||
remotes/origin/am/refactor/shortint-factorize-key-noise
|
||||
remotes/origin/am/refactor/shortint-params
|
||||
remotes/origin/am/refactor/simplify-shortint-decompression
|
||||
remotes/origin/am/refactor/use-natural-decomposition-order
|
||||
remotes/origin/am/saved/crt_wip
|
||||
remotes/origin/am/test/add-some-noise-formulas
|
||||
remotes/origin/am/test/malicious-booleans
|
||||
remotes/origin/am/test/many-lut
|
||||
remotes/origin/am/test/noise-checks
|
||||
remotes/origin/am/wip-f
|
||||
remotes/origin/am/wip/fft-sampling
|
||||
remotes/origin/am/wip/ntt
|
||||
remotes/origin/am/wip/pfail
|
||||
remotes/origin/am/wip/prime-q-greedy
|
||||
remotes/origin/am/wip/rng-next
|
||||
remotes/origin/am/wip/simulation
|
||||
remotes/origin/am/wip/sts-testing
|
||||
remotes/origin/am/wip/tensor-prod
|
||||
remotes/origin/am/wip/updated-q
|
||||
remotes/origin/am/wip/zk-bench
|
||||
remotes/origin/artifact_ccs_2024
|
||||
remotes/origin/bb/signed_div
|
||||
remotes/origin/bb/signed_overflow_add
|
||||
remotes/origin/bc/ci-estimator
|
||||
remotes/origin/bc/ci/estimator
|
||||
remotes/origin/bench/gpu/div_rem
|
||||
remotes/origin/bench/gpu/pbs
|
||||
remotes/origin/bench_trace_packing_keyswitch
|
||||
remotes/origin/better-conformance
|
||||
remotes/origin/bitnot-free
|
||||
remotes/origin/blockchain-usecase
|
||||
remotes/origin/blogpost_example_0.2
|
||||
remotes/origin/c-threading
|
||||
remotes/origin/cb/feat/add_glwe_tensor_product_and_lwe_trace_packing_keyswitch
|
||||
remotes/origin/cb/feat/support_large_custom_modulus_multiplication
|
||||
remotes/origin/chore/add-slsa-framework
|
||||
remotes/origin/chore/as_optimize_pks_ml
|
||||
remotes/origin/chore/gpu/4090_core_crypto_bench
|
||||
remotes/origin/chore/gpu/bench_gpu_multi_bit_gf_4_pbs
|
||||
remotes/origin/chore/gpu/bench_scratch
|
||||
remotes/origin/chore/gpu/cuda_vec
|
||||
remotes/origin/chore/gpu/fourier_bsk
|
||||
remotes/origin/chore/gpu/get_lwe_chunk_size_refactor
|
||||
remotes/origin/chore/gpu/lwechunksize_refactor
|
||||
remotes/origin/chore/gpu/measure_ks_time
|
||||
remotes/origin/chore/gpu/multibitpbs
|
||||
remotes/origin/chore/gpu/pbs_naming
|
||||
remotes/origin/chore/gpu/pbsstruct
|
||||
remotes/origin/chore/gpu/reduce_scratch_time
|
||||
remotes/origin/chore/gpu/reduce_sumctvec_memory
|
||||
remotes/origin/chore/gpu/remove_amortized_degree
|
||||
remotes/origin/chore/gpu/speedup_signed_comparisons
|
||||
remotes/origin/chore/gpu/sum_ciphertexts_comments
|
||||
remotes/origin/chore/multi_bit_pbs_test
|
||||
remotes/origin/chore/param
|
||||
remotes/origin/chre/gpu/integer_mult_gpu_params
|
||||
remotes/origin/ci/bench_big_machine
|
||||
remotes/origin/ci/bench_multi_bits
|
||||
remotes/origin/ci/parallel_bench
|
||||
remotes/origin/ci/select_benchmarks
|
||||
remotes/origin/ci/wip
|
||||
remotes/origin/clean_smart_ops
|
||||
remotes/origin/cuda/sprint_demo_bench
|
||||
remotes/origin/custom_modulus_trace_packing_KS_and_tensor_product
|
||||
remotes/origin/custom_modulus_trace_packing_KS_and_tensor_product_trivial_approach_decomposer
|
||||
remotes/origin/dd/fix/bls446-serialize
|
||||
remotes/origin/dependabot/github_actions/actions/checkout-4.1.5
|
||||
remotes/origin/dependabot/github_actions/actions/checkout-4.1.6
|
||||
remotes/origin/dependabot/github_actions/actions/checkout-4.2.0
|
||||
remotes/origin/dependabot/github_actions/actions/upload-artifact-4.3.1
|
||||
remotes/origin/dependabot/github_actions/rtCamp/action-slack-notify-2.3.1
|
||||
remotes/origin/dependabot/github_actions/tj-actions/changed-files-44.5.2
|
||||
remotes/origin/dependabot/github_actions/zama-ai/slab-github-runner-1.4.0
|
||||
remotes/origin/dependabot/github_actions/zama-ai/slab-github-runner-1dced74825027fe3d481392163ed8fc56813fb5d
|
||||
remotes/origin/dependabot/github_actions/zama-ai/slab-github-runner-2f49f88318c59d644bb6d329b3259d3ec48e1e34
|
||||
remotes/origin/dependabot/github_actions/zama-ai/slab-github-runner-da46a715fa5ed427dfd4f3e34126b06923c93231
|
||||
remotes/origin/dependabot/github_actions/zgosalvez/github-actions-ensure-sha-pinned-actions-3.0.15
|
||||
remotes/origin/dev
|
||||
remotes/origin/doc-merge-benchmarks-pages
|
||||
remotes/origin/doc-proofreading
|
||||
remotes/origin/doc/0.5.0/gpu
|
||||
remotes/origin/doc/0.5.0/overflow
|
||||
remotes/origin/doc/wip0.3.1
|
||||
remotes/origin/docs-revamp
|
||||
remotes/origin/dt/bench/bench_80b_integers
|
||||
remotes/origin/dt/bench/bench_ops_dedup
|
||||
remotes/origin/dt/bench/big_sizes_blockchain
|
||||
remotes/origin/dt/bench/crt_ops
|
||||
remotes/origin/dt/bench/fix_naming_gpu
|
||||
remotes/origin/dt/bench/fix_signed_scalar_naming
|
||||
remotes/origin/dt/bench/integer_on_4090
|
||||
remotes/origin/dt/bench/integer_tuniform_default
|
||||
remotes/origin/dt/bench/oprf
|
||||
remotes/origin/dt/bench/other_hardware
|
||||
remotes/origin/dt/bench/pbs128
|
||||
remotes/origin/dt/bench/pbs_various_perror
|
||||
remotes/origin/dt/bench/pbs_with_shortint
|
||||
remotes/origin/dt/bench/test_compact_keys
|
||||
remotes/origin/dt/bench/test_parameters_secu_and_pfail
|
||||
remotes/origin/dt/bench/throughput
|
||||
remotes/origin/dt/bench/throughput_heuristic
|
||||
remotes/origin/dt/bench/trivium_app
|
||||
remotes/origin/dt/bench/wasm_benchs
|
||||
remotes/origin/dt/chore/bench_full_0.4.1
|
||||
remotes/origin/dt/chore/update_params
|
||||
remotes/origin/dt/chore/update_tuniform_params
|
||||
remotes/origin/dt/ci/bench_gpu_v100
|
||||
remotes/origin/dt/ci/bench_pfks
|
||||
remotes/origin/dt/ci/cargo_deny
|
||||
remotes/origin/dt/ci/change_rust_action
|
||||
remotes/origin/dt/ci/check_labels
|
||||
remotes/origin/dt/ci/check_typos
|
||||
remotes/origin/dt/ci/coverage_tests
|
||||
remotes/origin/dt/ci/cuda_release
|
||||
remotes/origin/dt/ci/filter_schedule
|
||||
remotes/origin/dt/ci/fix_bench_parser
|
||||
remotes/origin/dt/ci/fix_firefox_wasm
|
||||
remotes/origin/dt/ci/fix_gpu_integer_bench
|
||||
remotes/origin/dt/ci/fix_slack_notif
|
||||
remotes/origin/dt/ci/fix_throughput_bench
|
||||
remotes/origin/dt/ci/fix_weekly_cuda_bench
|
||||
remotes/origin/dt/ci/gpu_bench_hyperstack
|
||||
remotes/origin/dt/ci/gpu_tests_separated
|
||||
remotes/origin/dt/ci/handle_forks
|
||||
remotes/origin/dt/ci/improve_core_crypto_coverage
|
||||
remotes/origin/dt/ci/improve_coverage_core
|
||||
remotes/origin/dt/ci/integer_coverage
|
||||
remotes/origin/dt/ci/lattice_estimator_update
|
||||
remotes/origin/dt/ci/multi_bit_gpu_bench
|
||||
remotes/origin/dt/ci/no_cancel_workflow
|
||||
remotes/origin/dt/ci/npm_labels
|
||||
remotes/origin/dt/ci/pcc_test_gpu
|
||||
remotes/origin/dt/ci/remove_params
|
||||
remotes/origin/dt/ci/remove_wasm_test
|
||||
remotes/origin/dt/ci/slab_gh_runner
|
||||
remotes/origin/dt/ci/test_multibit_params
|
||||
remotes/origin/dt/ci/test_on_changes
|
||||
remotes/origin/dt/ci/test_perfo
|
||||
remotes/origin/dt/ci/test_slab_action
|
||||
remotes/origin/dt/ci/update_new_workflows
|
||||
remotes/origin/dt/ci/use_large_runner
|
||||
remotes/origin/dt/ci/vector_sum_bench
|
||||
remotes/origin/dt/doc/gpu_bench_array
|
||||
remotes/origin/dt/refacto/rename_operations
|
||||
remotes/origin/dt/test/check_perf
|
||||
remotes/origin/dt/test/sanity_checks_tuniform
|
||||
remotes/origin/expose-traits
|
||||
remotes/origin/feat/as_generalize_gemm_pks_all_params
|
||||
remotes/origin/feat/cuda/abs
|
||||
remotes/origin/feat/cuda/multi_bit_tbc_registers
|
||||
remotes/origin/feat/cuda/new_type_cast
|
||||
remotes/origin/feat/cuda/new_type_cast_no_round
|
||||
remotes/origin/feat/cuda/signed_div
|
||||
remotes/origin/feat/cuda_fft/reduce_sm_io
|
||||
remotes/origin/feat/gpu/apply_lut_with_factor
|
||||
remotes/origin/feat/gpu/benchmark
|
||||
remotes/origin/feat/gpu/cuda_vector_add
|
||||
remotes/origin/feat/gpu/cufftdx
|
||||
remotes/origin/feat/gpu/div
|
||||
remotes/origin/feat/gpu/div_backup
|
||||
remotes/origin/feat/gpu/div_backup2
|
||||
remotes/origin/feat/gpu/div_backup2_cleanup
|
||||
remotes/origin/feat/gpu/div_backup_2
|
||||
remotes/origin/feat/gpu/div_rem_update_info
|
||||
remotes/origin/feat/gpu/fft128
|
||||
remotes/origin/feat/gpu/first_multi_gpu
|
||||
remotes/origin/feat/gpu/generate_last_block_inner_propagation
|
||||
remotes/origin/feat/gpu/host_rotate_blocks
|
||||
remotes/origin/feat/gpu/monokernel_multibitpbs
|
||||
remotes/origin/feat/gpu/mul_boolean_optimization
|
||||
remotes/origin/feat/gpu/multi_gpu
|
||||
remotes/origin/feat/gpu/multi_gpu_buffers
|
||||
remotes/origin/feat/gpu/overflow_add
|
||||
remotes/origin/feat/gpu/overflowing_scalar_add_and_sub
|
||||
remotes/origin/feat/gpu/overflowing_sub
|
||||
remotes/origin/feat/gpu/propagate_single_carry_get_input_carries
|
||||
remotes/origin/feat/gpu/refactorchaos
|
||||
remotes/origin/feat/gpu/reuse_pbs_memory
|
||||
remotes/origin/feat/gpu/scalar_mul
|
||||
remotes/origin/feat/gpu/scalareq
|
||||
remotes/origin/feat/gpu/shift_rotate
|
||||
remotes/origin/feat/gpu/shifts_one_wave
|
||||
remotes/origin/feat/gpu/signed
|
||||
remotes/origin/feat/gpu/signed_bitops
|
||||
remotes/origin/feat/gpu/signed_comparisons
|
||||
remotes/origin/feat/gpu/signed_eq_ne
|
||||
remotes/origin/feat/gpu/signed_if_then_else
|
||||
remotes/origin/feat/gpu/signed_mul
|
||||
remotes/origin/feat/gpu/signed_overflowing_add_or_sub
|
||||
remotes/origin/feat/gpu/signed_overflowing_add_or_sub_with_cuda
|
||||
remotes/origin/feat/gpu/signed_overflowing_sub
|
||||
remotes/origin/feat/gpu/signed_scalar_add
|
||||
remotes/origin/feat/gpu/signed_scalar_bitops
|
||||
remotes/origin/feat/gpu/signed_scalar_comparisons
|
||||
remotes/origin/feat/gpu/signed_scalar_shift
|
||||
remotes/origin/feat/gpu/signed_scalar_sub_mul
|
||||
remotes/origin/feat/gpu/signedmaxmin
|
||||
remotes/origin/feat/gpu/sm_multibit
|
||||
remotes/origin/feat/gpu/tbc
|
||||
remotes/origin/feat/gpu/threadblockcluster
|
||||
remotes/origin/feat/gpu/unsigned_cast
|
||||
remotes/origin/feat/gpu/unsigned_overflowing_scalar_add
|
||||
remotes/origin/feature/hl_casting_keys_2
|
||||
remotes/origin/feature/trivium
|
||||
remotes/origin/fix-addition-for-concrete
|
||||
remotes/origin/fix-comp
|
||||
remotes/origin/fix/bench/ks
|
||||
remotes/origin/fix/bench_parsing
|
||||
remotes/origin/fix/gpu/fix_terms_degree_bug
|
||||
remotes/origin/fix/gpu/full_carry_prop_noise_level
|
||||
remotes/origin/fix/gpu/maxsharedmemory
|
||||
remotes/origin/fix/gpu/single_carry_propagation
|
||||
remotes/origin/full-propa
|
||||
remotes/origin/gitbook-edit
|
||||
remotes/origin/go/chore/add-gpu-parameters-multibit-gf4
|
||||
remotes/origin/go/chore/add-lazy-loading-info-to-documentation
|
||||
remotes/origin/go/chore/add-nvtx-gpu
|
||||
remotes/origin/go/chore/gpu-new-bitnot
|
||||
remotes/origin/go/chore/gpu-sampling-tool
|
||||
remotes/origin/go/chore/implement-is-equal
|
||||
remotes/origin/go/chore/review-keyswitch
|
||||
remotes/origin/go/chore/review-opt-choice-on-h100-for-multi-bit-pbs
|
||||
remotes/origin/go/chore/simple-test-fftgf4
|
||||
remotes/origin/go/chore/update-classical-params-gpu
|
||||
remotes/origin/go/chore/update-gpu-parameters
|
||||
remotes/origin/go/chore/use-new-gpu-params
|
||||
remotes/origin/go/feature/implement-leading-trailing-zeros-and-one-tests
|
||||
remotes/origin/go/feature/implement-leading-trailing-zeros-and-ones
|
||||
remotes/origin/go/feature/implement-many-luts
|
||||
remotes/origin/go/feature/implement-sub-array-search
|
||||
remotes/origin/go/feature/implement-subarray-search-gpu
|
||||
remotes/origin/go/refactor/apply-overlapping-strategy-on-keybundle
|
||||
remotes/origin/go/refactor/improve-fft-gpu
|
||||
remotes/origin/go/refactor/improve-full-propagation-and-sum-algorithms
|
||||
remotes/origin/go/refactor/improve-keybundle-further
|
||||
remotes/origin/go/refactor/keybundle
|
||||
remotes/origin/go/refactor/specify-launch-bounds-on-kernels
|
||||
remotes/origin/go/refactor/understand-classical-pbs-bottlenecks
|
||||
remotes/origin/go/refactor/use-const-and-restrict-on-gpu
|
||||
remotes/origin/go/wip/improve-syncs
|
||||
remotes/origin/go/wip/pfail_gpu
|
||||
remotes/origin/gpu/hyperstack_ci
|
||||
remotes/origin/hlapi
|
||||
remotes/origin/hlapi-1
|
||||
remotes/origin/hlapi-clean
|
||||
remotes/origin/hlapi-fhe-bool-cuda
|
||||
remotes/origin/integer-prelude
|
||||
remotes/origin/integration/0.6/versioning
|
||||
remotes/origin/integration/main/versioning
|
||||
remotes/origin/integration/versioning
|
||||
remotes/origin/jb/experiments_pke
|
||||
remotes/origin/jb/tmp
|
||||
remotes/origin/jkl/agnes-noise_test-branch
|
||||
remotes/origin/jkl/fft-noise-model
|
||||
remotes/origin/joc_benches
|
||||
remotes/origin/kc1212/witness-dim
|
||||
remotes/origin/main
|
||||
remotes/origin/multibit-deterministic
|
||||
remotes/origin/mz/boolean_oprf
|
||||
remotes/origin/mz/cleanups
|
||||
remotes/origin/mz/clippy_hl_c_api
|
||||
remotes/origin/mz/clippy_trivium_bench
|
||||
remotes/origin/mz/conformnce_proven_cctl
|
||||
remotes/origin/mz/ct_mod_compression
|
||||
remotes/origin/mz/destruct_conformance
|
||||
remotes/origin/mz/doc_prf
|
||||
remotes/origin/mz/enable_all_doctest_warnings
|
||||
remotes/origin/mz/fix_ambiguity
|
||||
remotes/origin/mz/fix_apply_lookup_table_bivariate_check
|
||||
remotes/origin/mz/fix_clippy_doctests
|
||||
remotes/origin/mz/fix_fhe_strings
|
||||
remotes/origin/mz/fix_lint_build
|
||||
remotes/origin/mz/fix_max_noise_level
|
||||
remotes/origin/mz/fix_oom
|
||||
remotes/origin/mz/fix_prf_p_error
|
||||
remotes/origin/mz/fix_versionning
|
||||
remotes/origin/mz/glwe_ms_compression
|
||||
remotes/origin/mz/glwe_packing
|
||||
remotes/origin/mz/integer_compression_key
|
||||
remotes/origin/mz/just
|
||||
remotes/origin/mz/key_conformance
|
||||
remotes/origin/mz/max_noise_level_woppbs_parameters
|
||||
remotes/origin/mz/move_strings
|
||||
remotes/origin/mz/mpi
|
||||
remotes/origin/mz/mpi_no_priority
|
||||
remotes/origin/mz/ms_compression
|
||||
remotes/origin/mz/ms_compression_hl
|
||||
remotes/origin/mz/ms_compression_multi_bit
|
||||
remotes/origin/mz/ms_compression_shortint
|
||||
remotes/origin/mz/ms_compression_shortint_integer
|
||||
remotes/origin/mz/pack_in_carry
|
||||
remotes/origin/mz/refacto_wop_params
|
||||
remotes/origin/mz/refactor_decryption
|
||||
remotes/origin/mz/regroup_optionals_in_proofs
|
||||
remotes/origin/mz/remove_noise_CompactCt
|
||||
remotes/origin/mz/rename_LweCiphertextListParameters
|
||||
remotes/origin/mz/rename_multi_bit
|
||||
remotes/origin/mz/shortint/fix_degree
|
||||
remotes/origin/mz/stabilize_fhe_types
|
||||
remotes/origin/mz/string_server_key
|
||||
remotes/origin/mz/string_tests
|
||||
remotes/origin/mz/strings_function_executor
|
||||
remotes/origin/mz/test-compat
|
||||
remotes/origin/mz/test_shortint_lamellar
|
||||
remotes/origin/mz/update_toolchain
|
||||
remotes/origin/mz/use_secrets_in_ci
|
||||
remotes/origin/new-front-bivarfix
|
||||
remotes/origin/ns/0.6/doc_versioning
|
||||
remotes/origin/ns/0.6_with_versionize
|
||||
remotes/origin/ns/backport_0.6/fix_zk_32b
|
||||
remotes/origin/ns/backward_compat_hl
|
||||
remotes/origin/ns/bench_tfhe_zk_pok
|
||||
remotes/origin/ns/bench_zk_cs_ghl
|
||||
remotes/origin/ns/check_markdown_intra_links
|
||||
remotes/origin/ns/chore/params_zkv1
|
||||
remotes/origin/ns/chore/update_node
|
||||
remotes/origin/ns/clippy_versionable
|
||||
remotes/origin/ns/feat/integer_slice
|
||||
remotes/origin/ns/fix_bench_zk_pok
|
||||
remotes/origin/ns/ignore_data_repo_doc
|
||||
remotes/origin/ns/refactor_zk_tests
|
||||
remotes/origin/ns/release/0.7.2
|
||||
remotes/origin/ns/release_0.6.4
|
||||
remotes/origin/ns/safe_serialize_api
|
||||
remotes/origin/ns/test/bisect_pbs_fail
|
||||
remotes/origin/ns/test/data_automerge
|
||||
remotes/origin/ns/test/noise_check_perf
|
||||
remotes/origin/ns/test/noise_check_perf_revert
|
||||
remotes/origin/ns/test/pr_close
|
||||
remotes/origin/ns/test_data_lists
|
||||
remotes/origin/ns/test_rayon_bitslice
|
||||
remotes/origin/ns/test_type_breaking
|
||||
remotes/origin/ns/test_zk_bad_noise
|
||||
remotes/origin/ns/update_arkworks
|
||||
remotes/origin/ns/update_toolchain_2024-04
|
||||
remotes/origin/ns/upgrade_data_07
|
||||
remotes/origin/ns/versionable_unsupported
|
||||
remotes/origin/ns/versionize_arrays
|
||||
remotes/origin/ns/versionize_clean
|
||||
remotes/origin/ns/versionize_transparent
|
||||
remotes/origin/ns/versionize_zk
|
||||
remotes/origin/ns/zk_curve_conformance
|
||||
remotes/origin/ns/zk_test_bad_ct
|
||||
remotes/origin/ns/zk_test_bad_delta
|
||||
remotes/origin/ns/zk_test_wasm_x86
|
||||
remotes/origin/ntt-experiment
|
||||
remotes/origin/pa/chore/alt_pbs_chunk_size
|
||||
remotes/origin/pa/chore/cmux
|
||||
remotes/origin/pa/chore/improve_compression_test
|
||||
remotes/origin/pa/chore/log2
|
||||
remotes/origin/pa/chore/pbs
|
||||
remotes/origin/pa/feat/benchmark_packing_ks
|
||||
remotes/origin/pa/feat/compression
|
||||
remotes/origin/pa/fix/cmux
|
||||
remotes/origin/pa/fix/compression
|
||||
remotes/origin/pa/fix/compression_alt
|
||||
remotes/origin/pa/fix/multigpu_compression
|
||||
remotes/origin/pa/refactor/monokernel_tbc
|
||||
remotes/origin/pa/refactor/sampleextraction
|
||||
remotes/origin/readme-restructure
|
||||
remotes/origin/rebase_trace_packing_keyswitch
|
||||
remotes/origin/release/0.1.x
|
||||
remotes/origin/release/0.10.x
|
||||
remotes/origin/release/0.2.x
|
||||
remotes/origin/release/0.3.x
|
||||
remotes/origin/release/0.4.x
|
||||
remotes/origin/release/0.5.x
|
||||
remotes/origin/release/0.6.x
|
||||
remotes/origin/release/0.7.x
|
||||
remotes/origin/release/0.8.x
|
||||
remotes/origin/release/0.9.x
|
||||
remotes/origin/release_pr_stack
|
||||
remotes/origin/saved/all-decomp-formulas
|
||||
remotes/origin/saved/am/chore/dep-updates
|
||||
remotes/origin/saved/am/chore/release/0.7.x
|
||||
remotes/origin/saved/am/chore/update-wop-parameters
|
||||
remotes/origin/saved/mul-failure
|
||||
remotes/origin/sk/feat/batch-pbs
|
||||
remotes/origin/sk/feat/cuda-fft
|
||||
remotes/origin/sk/feat/ks-optimization
|
||||
remotes/origin/sk/feat/ntt-keygen
|
||||
remotes/origin/sk/feat/ntt-pbs
|
||||
remotes/origin/sk/feat/perf-modular-arithmetic
|
||||
remotes/origin/sk/feat/zk-pke-v2
|
||||
remotes/origin/sk/refactor/fft
|
||||
remotes/origin/smart-ops-hlapi
|
||||
remotes/origin/testing_lwe_keyswitch_solinas
|
||||
remotes/origin/tm/0.6-backports
|
||||
remotes/origin/tm/better-compact-list
|
||||
remotes/origin/tm/bind-match
|
||||
remotes/origin/tm/c-api-error
|
||||
remotes/origin/tm/compiler-inter-op
|
||||
remotes/origin/tm/count_ones
|
||||
remotes/origin/tm/cpk-param-getter
|
||||
remotes/origin/tm/erc20
|
||||
remotes/origin/tm/faster-carry-prop
|
||||
remotes/origin/tm/faster-scalar-add
|
||||
remotes/origin/tm/faster-shift
|
||||
remotes/origin/tm/fix-empty-prop
|
||||
remotes/origin/tm/fix-sub-signed-overflow
|
||||
remotes/origin/tm/full-prop-clean
|
||||
remotes/origin/tm/hl-bigger-types
|
||||
remotes/origin/tm/hlapi-compressed
|
||||
remotes/origin/tm/hlapi-gpu-tests
|
||||
remotes/origin/tm/hlstrings
|
||||
remotes/origin/tm/new-sum
|
||||
remotes/origin/tm/noise-asserts
|
||||
remotes/origin/tm/pub-reexports
|
||||
remotes/origin/tm/reduce-test-per-params
|
||||
remotes/origin/tm/safer-eq-ne
|
||||
remotes/origin/tm/scalar-cmps
|
||||
remotes/origin/tm/shorting-scalar-div
|
||||
remotes/origin/tm/shortint-encoding
|
||||
remotes/origin/tm/tensor-hlapi
|
||||
remotes/origin/tm/test-num-blocks
|
||||
remotes/origin/tm/unsigned-cmps
|
||||
remotes/origin/tm/vector-contains-sub
|
||||
remotes/origin/tm/vector-hlapi
|
||||
remotes/origin/tm/zk-conincal-serde
|
||||
remotes/origin/tmp_ccs
|
||||
remotes/origin/tore/internal-val-access
|
||||
remotes/origin/trace_packing_keyswitch
|
||||
remotes/origin/trace_packing_ks_benchmarks
|
||||
remotes/origin/updated_trace_packing_for_native_ciphertext_modulus
|
||||
remotes/origin/wip-compile-error-no-release
|
||||
remotes/origin/yuxizama-patch-1
|
||||
4
cplx_mul.log
Normal file
4
cplx_mul.log
Normal file
@@ -0,0 +1,4 @@
|
||||
-224091083899144574993365467136.000000 15330585716229.072266 224091083972931551288203673600.000000 -15306289712228.677734
|
||||
-224091083899144574993365467136.00000 15331705505758.05664 224091083899144574993365467136.00000 -15307409501759.66211
|
||||
224091083899144574993365467136.00000 -15307409501759.66211 224091083899144574993365467136.00000 -15331705505758.05664
|
||||
224091083899144574993365467136.00000 -15307409501759.66211 224091083899144574993365467136.00000 -15331705505758.05664
|
||||
115
cpu.log
Normal file
115
cpu.log
Normal file
@@ -0,0 +1,115 @@
|
||||
-0.3309726757578757561084614735591458156704902648925781250000000000000000000000000000000000000000000000 -0.0000000000000000262161441032774636130079215314589595089813221258498096566391666328854626044631004333 0.0380704474245701987578094360742397839203476905822753906250000000000000000000000000000000000000000000 -0.0000000000000000009616181682828339618607862290825046798652604462695437856356583949946070788428187370
|
||||
-0.1992581793808917978161332484887680038809776306152343750000000000000000000000000000000000000000000000 -0.0000000000000000075339409234980193504176368576269248248756960902950199909788864260917762294411659241 0.0176522535578225135688423819146919413469731807708740234375000000000000000000000000000000000000000000 -0.0000000000000000011079942189849169212186822638096097214445589711721970246904866996828786795958876610
|
||||
0.2260550165243644937795863825158448889851570129394531250000000000000000000000000000000000000000000000 0.0000000000000000126022745915227752500212969900318988585096703503442110150256638689825194887816905975 -0.0799689927978955578113939850481983739882707595825195312500000000000000000000000000000000000000000000 -0.0000000000000000063460023733636283936840017401873121454740939053999759789093104700441472232341766357
|
||||
0.2426504064692105999689886175474384799599647521972656250000000000000000000000000000000000000000000000 0.0000000000000000004677977136086304752259424595325139275587325315104779248354116560904003563337028027 0.0365333979932580613092518717621715040877461433410644531250000000000000000000000000000000000000000000 0.0000000000000000005709344770022019545359771749790997852063776745568752721271543748571275500580668449
|
||||
0.0914749448999244568758726359192223753780126571655273437500000000000000000000000000000000000000000000 -0.0000000000000000036609340702821524448717294490586665457459787183420747008755213869335420895367860794 0.2786308216747127586288002021319698542356491088867187500000000000000000000000000000000000000000000000 0.0000000000000000041379918916404281616895809751333352103206111649103329797050321303686359897255897522
|
||||
-0.0124926024286186147793653589133100467734038829803466796875000000000000000000000000000000000000000000 -0.0000000000000000006937190272456474979519993342187765906642550802765293666007906381310021970421075821 0.2110491012472701377600259320388431660830974578857421875000000000000000000000000000000000000000000000 -0.0000000000000000051785532317672362668742834231741586121061746167014080977075707323820097371935844421
|
||||
-0.0494172631142162574469089975082169985398650169372558593750000000000000000000000000000000000000000000 0.0000000000000000020700342936472428610421033900271927805735103256831817347682900276595319155603647232 -0.2029124738196469157625045909298933111131191253662109375000000000000000000000000000000000000000000000 0.0000000000000000032892946516813893505437931152017216374523186022500384484423818776122061535716056824
|
||||
0.0315092344667192991725102046984829939901828765869140625000000000000000000000000000000000000000000000 0.0000000000000000026511341643412592805126523014407753588898695525533522748817816250266332644969224930 -0.2988676928669205534738750884571345522999763488769531250000000000000000000000000000000000000000000000 -0.0000000000000000223519185626510679759809968042136173580417081359914335814131902679946506395936012268
|
||||
-0.1672368467298269156806611590582178905606269836425781250000000000000000000000000000000000000000000000 -0.0000000000000000048162869081133674839304134386709639116236975270920334102342508231231477111577987671 0.2727747087939790260335826133086811751127243041992187500000000000000000000000000000000000000000000000 -0.0000000000000000085823320193215261882172073504002543651078135111842128124326478655348182655870914459
|
||||
-0.1420260232345169348899815986442263238131999969482421875000000000000000000000000000000000000000000000 0.0000000000000000078353130752317586555795127350300028590782462991921192285715846992388833314180374146 0.1452463415301429028314572633462375961244106292724609375000000000000000000000000000000000000000000000 -0.0000000000000000130538715628498043839887390457914535898550603454414467674560285104234935715794563293
|
||||
0.1033287219076595153666175974649377167224884033203125000000000000000000000000000000000000000000000000 0.0000000000000000065787245791957383370689888773812031301831047645908498444633494273148244246840476990 -0.1956877991027157481784826131843146868050098419189453125000000000000000000000000000000000000000000000 -0.0000000000000000108434329174532879698514604527317036512444788844025493623757228078829939477145671844
|
||||
0.2183523172540651480222351210613851435482501983642578125000000000000000000000000000000000000000000000 -0.0000000000000000042952334406807644211423904079848498142548976906260858876773767178747220896184444427 -0.1611283900775630495427748201109352521598339080810546875000000000000000000000000000000000000000000000 0.0000000000000000049035193854080055673983422185833393998012569040749301929604797578576835803687572479
|
||||
0.2397036106352415651965515053234412334859371185302734375000000000000000000000000000000000000000000000 -0.0000000000000000052953977110488860650721110650147839714434454694985676681540098797995597124099731445 0.1115723403910117567106041747138078790158033370971679687500000000000000000000000000000000000000000000 -0.0000000000000000050113374213532207659401750409473001366666780484688051289232646468008169904351234436
|
||||
0.1398900907499183998705660769701353274285793304443359375000000000000000000000000000000000000000000000 0.0000000000000000025854512863426880011882823985481452679392879801315019427931929385522380471229553223 0.1766487032839706705633631145246908999979496002197265625000000000000000000000000000000000000000000000 0.0000000000000000047294440057263980645363900766920231270194547596580608023186798050119250547140836716
|
||||
-0.1644190411459663658799712493419065140187740325927734375000000000000000000000000000000000000000000000 -0.0000000000000000061410496854399099844472379531271533022912420237207631323883560980902984738349914551 -0.1171702953286619941497548325060051865875720977783203125000000000000000000000000000000000000000000000 0.0000000000000000018800082941866710238088777334985347003193960609764186220349557743247714824974536896
|
||||
-0.2271417216519029391719186605769209563732147216796875000000000000000000000000000000000000000000000000 -0.0000000000000000003410581758850931755798386799378379375470085294421492688732300280207709874957799911 -0.2324424613666220273966445120095158927142620086669921875000000000000000000000000000000000000000000000 0.0000000000000000052705161220801475140341690423664438031522828137140877780009162734131678007543087006
|
||||
-0.2768098499713873561489663188694976270198822021484375000000000000000000000000000000000000000000000000 0.0000000000000000271839307303537356402336171088715461786860875246282093220528963684046175330877304077 0.1777761056245066584313718749399413354694843292236328125000000000000000000000000000000000000000000000 0.0000000000000000031436349524061540422500698511335631041026594357601451484862664642605523113161325455
|
||||
-0.1816506822808992283047047067157109268009662628173828125000000000000000000000000000000000000000000000 -0.0000000000000000091744046880454127720279131854420783978544864776692377428268798666977090761065483093 0.0859671095812945679215388850025192368775606155395507812500000000000000000000000000000000000000000000 -0.0000000000000000019790221122024815278620829931698194201352001468739022493892321108432952314615249634
|
||||
0.1733917003068170814561455017610569484531879425048828125000000000000000000000000000000000000000000000 -0.0000000000000000002355669226462113621703291114551109740391745850677726603628059365291846916079521179 -0.1507412427231122065141022403622628189623355865478515625000000000000000000000000000000000000000000000 -0.0000000000000000050074860604269868780685029482838904707624616828879823837716855905455304309725761414
|
||||
0.2504400589351173378460657659161370247602462768554687500000000000000000000000000000000000000000000000 -0.0000000000000000105495846440776700797075650436594384534827739945562347090302068863820750266313552856 -0.0608093817831304034315920148401346523314714431762695312500000000000000000000000000000000000000000000 -0.0000000000000000006649734888645786681087537662594592372951811360499638214882622833101777359843254089
|
||||
0.1862075376662823411599845258024288341403007507324218750000000000000000000000000000000000000000000000 0.0000000000000000092738192415434320391903987162228447654735493063692976234424492076868773438036441803 0.2071792938507401871106594626326113939285278320312500000000000000000000000000000000000000000000000000 0.0000000000000000037279901254371484709180590427832381461048739621779671976620207374253368470817804337
|
||||
0.0649597768658507629391252180539595428854227066040039062500000000000000000000000000000000000000000000 -0.0000000000000000000599891608510439561166750358405470603488327470983847156116297583139385096728801727 0.2077132880219826616130518459613085724413394927978515625000000000000000000000000000000000000000000000 0.0000000000000000109365476335817531496040397320749442280186865305083299126920337585033848881721496582
|
||||
-0.1149666930854301022391084075024991761893033981323242187500000000000000000000000000000000000000000000 -0.0000000000000000046719677244762014686309320217246739870689257903384195014240276577766053378582000732 -0.1693829849301254908411351607355754822492599487304687500000000000000000000000000000000000000000000000 0.0000000000000000019290098218370279959780502759588964735884392347577739185160439205901639070361852646
|
||||
-0.1013849860233902999695487778808455914258956909179687500000000000000000000000000000000000000000000000 -0.0000000000000000010958363504292690122307333050960276846345426518287025779985199847033072728663682938 -0.2972510798575715429414856316725490614771842956542968750000000000000000000000000000000000000000000000 0.0000000000000000151060681535435571571128080956451898860237590587266010011546768510015681385993957520
|
||||
-0.0338490332315699790632912424825917696580290794372558593750000000000000000000000000000000000000000000 -0.0000000000000000032112051133747497270867287155696078651722246333124338724118729260226245969533920288 0.3056514274567712630137350515724392607808113098144531250000000000000000000000000000000000000000000000 -0.0000000000000000084660885661200502048130129231798305394126263491287077977975883413819246925413608551
|
||||
-0.0838544664005262951889108080649748444557189941406250000000000000000000000000000000000000000000000000 -0.0000000000000000051800512586075403991650040606243722019399192612806870350539156788727268576622009277 0.1888297106619587217846145676958258263766765594482421875000000000000000000000000000000000000000000000 0.0000000000000000125710005198254852176108184946779873729544425121429709890641390757082263007760047913
|
||||
0.0259785877288864451573058289568507461808621883392333984375000000000000000000000000000000000000000000 0.0000000000000000003808995729306562665530126889433911280323679558962660096632646400394150987267494202 -0.2127655593344512374720522984716808423399925231933593750000000000000000000000000000000000000000000000 -0.0000000000000000019935625096555455247881784143722497167520067235186101428645599753508577123284339905
|
||||
0.1435981506670927776436741396537399850785732269287109375000000000000000000000000000000000000000000000 0.0000000000000000131126020340999405450862544727246234289488926316605624244893135710299247875809669495 -0.2472170630286313064427616836837842129170894622802734375000000000000000000000000000000000000000000000 -0.0000000000000000122808089510788868829075694614557771638870714467047309814695665863837348297238349915
|
||||
0.2511343731265572865041235672833863645792007446289062500000000000000000000000000000000000000000000000 0.0000000000000000092452280298070285367329115251744832217129003532020067868124613141844747588038444519 0.0108656958033641493771570551984950725454837083816528320312500000000000000000000000000000000000000000 -0.0000000000000000002947488139454235324126363051764372718819140216213133855183148313017227337695658207
|
||||
0.2025455324622073560725254992576083168387413024902343750000000000000000000000000000000000000000000000 0.0000000000000000092268658403218626649354055433850145426009984179994624642340284026431618258357048035 0.1183192308221643201937567368986492510885000228881835937500000000000000000000000000000000000000000000 -0.0000000000000000033112219949499888538350744600518247336266158650688439663012374580830510240048170090
|
||||
-0.1933471753078349342214181660892791114747524261474609375000000000000000000000000000000000000000000000 0.0000000000000000056708946471137426758117517411030692033211668293211232816375400034303311258554458618 -0.0524965901614871180425048180495650740340352058410644531250000000000000000000000000000000000000000000 0.0000000000000000026002120264691327416023942568955159965262954086960469485267921641025168355554342270
|
||||
-0.3123928209210611317558914379333145916461944580078125000000000000000000000000000000000000000000000000 0.0000000000000000259207635313622839687841546274592684607857754166323344113820326128916349261999130249 -0.1116379705409853706488121360962395556271076202392578125000000000000000000000000000000000000000000000 0.0000000000000000066179487727938613466908117740857190639118025083514895712255565740633755922317504883
|
||||
32768
|
||||
340282366920938463463374607431768170496
|
||||
340282366920938463463374607431768080384
|
||||
340282366920938463463374607431768023040
|
||||
340282366920938463463374607431768092672
|
||||
340282366920938463463374607431768047616
|
||||
340282366920938463463374607431767556096
|
||||
71168
|
||||
277888
|
||||
1262592
|
||||
4018176
|
||||
16760832
|
||||
67108864
|
||||
268304384
|
||||
1074266112
|
||||
4295099124
|
||||
17180026739
|
||||
68719574034
|
||||
274877809558
|
||||
1099511650955
|
||||
4398046581374
|
||||
17592185995320
|
||||
70368743494672
|
||||
281474976801920
|
||||
1125899906931200
|
||||
4503599627216896
|
||||
18014398509543424
|
||||
72057594037780480
|
||||
288230376151842816
|
||||
1152921504606584832
|
||||
4611686018426339328
|
||||
18446744073709455341
|
||||
73786976294838222848
|
||||
295147905179352834048
|
||||
1180591620717411139584
|
||||
4722366482869645363200
|
||||
18889465931478580777984
|
||||
75557863725914323349504
|
||||
302231454903657293758464
|
||||
1208925819614629174814720
|
||||
4835703278458516698808960
|
||||
19342813113834066795323392
|
||||
77371252455336267180941312
|
||||
309485009821345068724764672
|
||||
1237940039285380274898993152
|
||||
4951760157141521099596627968
|
||||
19807040628566084398385463296
|
||||
79228162514264337593544017586
|
||||
316912650057057350374175694614
|
||||
1267650600228229401496703230278
|
||||
5070602400912917605986813129648
|
||||
20282409603651670423947251205115
|
||||
81129638414606681695789005047098
|
||||
324518553658426726783156020669176
|
||||
1298074214633706907132624082931600
|
||||
5192296858534827628530496329081216
|
||||
20769187434139310514121985316977152
|
||||
83076749736557242056487941267441664
|
||||
332306998946228968225951765070360576
|
||||
1329227995784915872903807060280262656
|
||||
5316911983139663491615228241121247232
|
||||
21267647932558653966460912964485251072
|
||||
85070591730234615865843651857941004288
|
||||
58358
|
||||
test core_crypto::fft_impl::fft128::math::fft::tests::test_roundtrip_u128 ... ok
|
||||
|
||||
test result: ok. 1 passed; 0 failed; 0 ignored; 0 measured; 345 filtered out; finished in 0.01s
|
||||
|
||||
|
||||
running 0 tests
|
||||
|
||||
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
|
||||
|
||||
|
||||
running 0 tests
|
||||
|
||||
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
|
||||
|
||||
|
||||
running 0 tests
|
||||
|
||||
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 211 filtered out; finished in 0.01s
|
||||
|
||||
221
gpu.log
Normal file
221
gpu.log
Normal file
@@ -0,0 +1,221 @@
|
||||
0.7071067811865475727373109293694142252206802368164062500000000000000000000 -0.0000000000000000483364665672645672552734986488347382506490835271488470948 0.7071067811865475727373109293694142252206802368164062500000000000000000000 -0.0000000000000000483364665672645672552734986488347382506490835271488470948
|
||||
0.9238795325112867384831361050601117312908172607421875000000000000000000000 0.0000000000000000176450470843366770599569725809896737820548554530070833879 0.3826834323650897817792326804919866845011711120605468750000000000000000000 -0.0000000000000000100507726964615876116866635215079604149716565502162279877
|
||||
-0.3826834323650897817792326804919866845011711120605468750000000000000000000 0.0000000000000000100507726964615876116866635215079604149716565502162279877 0.9238795325112867384831361050601117312908172607421875000000000000000000000 0.0000000000000000176450470843366770599569725809896737820548554530070833879
|
||||
0.9807852804032304305792422383092343807220458984375000000000000000000000000 0.0000000000000000185469399978250057259083323855314298012062932205111162598 0.1950903220161282758393639369387528859078884124755859375000000000000000000 -0.0000000000000000079910790684617312634428970369307223484248965070806183797
|
||||
-0.1950903220161282758393639369387528859078884124755859375000000000000000000 0.0000000000000000079910790684617312634428970369307223484248965070806183797 0.9807852804032304305792422383092343807220458984375000000000000000000000000 0.0000000000000000185469399978250057259083323855314298012062932205111162598
|
||||
0.5555702330196021776487214083317667245864868164062500000000000000000000000 0.0000000000000000470941094056167682138404887426404929202505139640422271441 0.8314696123025452356714026791451033204793930053710937500000000000000000000 0.0000000000000000014073856984728023893078524226353211699334975107286878349
|
||||
-0.8314696123025452356714026791451033204793930053710937500000000000000000000 -0.0000000000000000014073856984728023893078524226353211699334975107286878349 0.5555702330196021776487214083317667245864868164062500000000000000000000000 0.0000000000000000470941094056167682138404887426404929202505139640422271441
|
||||
0.9951847266721969287317506314138881862163543701171875000000000000000000000 -0.0000000000000000424869136783044157591677787910035744642531197693552380557 0.0980171403295606036287779261328978464007377624511718750000000000000000000 -0.0000000000000000016345823622442606095622288086432645091240042271949622910
|
||||
-0.0980171403295606036287779261328978464007377624511718750000000000000000000 0.0000000000000000016345823622442606095622288086432645091240042271949622910 0.9951847266721969287317506314138881862163543701171875000000000000000000000 -0.0000000000000000424869136783044157591677787910035744642531197693552380557
|
||||
0.6343932841636454877942696839454583823680877685546875000000000000000000000 0.0000000000000000104209019292800237913543214649613418305006117285573669085 0.7730104533627369933768136434082407504320144653320312500000000000000000000 -0.0000000000000000325659070336497908244691644422883020460214305129287293283
|
||||
-0.7730104533627369933768136434082407504320144653320312500000000000000000000 0.0000000000000000325659070336497908244691644422883020460214305129287293283 0.6343932841636454877942696839454583823680877685546875000000000000000000000 0.0000000000000000104209019292800237913543214649613418305006117285573669085
|
||||
0.8819212643483550495560052695509511977434158325195312500000000000000000000 -0.0000000000000000198432484058905683070896551742611855256204741109827063839 0.4713967368259976420397094898362411186099052429199218750000000000000000000 0.0000000000000000065166781360690129644720854136527256367167864498925970024
|
||||
-0.4713967368259976420397094898362411186099052429199218750000000000000000000 -0.0000000000000000065166781360690129644720854136527256367167864498925970024 0.8819212643483550495560052695509511977434158325195312500000000000000000000 -0.0000000000000000198432484058905683070896551742611855256204741109827063839
|
||||
0.2902846772544623865641710835916455835103988647460937500000000000000000000 -0.0000000000000000189279787077742607055779713593855819062153334180614860927 0.9569403357322088243819280251045711338520050048828125000000000000000000000 0.0000000000000000405538698618756882243805365526526472088078930809326716478
|
||||
-0.9569403357322088243819280251045711338520050048828125000000000000000000000 -0.0000000000000000405538698618756882243805365526526472088078930809326716478 0.2902846772544623865641710835916455835103988647460937500000000000000000000 -0.0000000000000000189279787077742607055779713593855819062153334180614860927
|
||||
0.9987954562051724050064649418345652520656585693359375000000000000000000000 -0.0000000000000000122916933370754648023069228195440464413706083235768885820 0.0490676743274180149345653489945107139647006988525390625000000000000000000 -0.0000000000000000006796103720518276261450983336612892861148698896541419864
|
||||
-0.0490676743274180149345653489945107139647006988525390625000000000000000000 0.0000000000000000006796103720518276261450983336612892861148698896541419864 0.9987954562051724050064649418345652520656585693359375000000000000000000000 -0.0000000000000000122916933370754648023069228195440464413706083235768885820
|
||||
0.6715589548470184411144145997241139411926269531250000000000000000000000000 -0.0000000000000000404890377492966801319705884961725462258731885432927466439 0.7409511253549591058842338497925084084272384643554687500000000000000000000 -0.0000000000000000147086169522973421017060338640487283358362467376636434357
|
||||
-0.7409511253549591058842338497925084084272384643554687500000000000000000000 0.0000000000000000147086169522973421017060338640487283358362467376636434357 0.6715589548470184411144145997241139411926269531250000000000000000000000000 -0.0000000000000000404890377492966801319705884961725462258731885432927466439
|
||||
0.9039892931234433381959547659789677709341049194335937500000000000000000000 -0.0000000000000000066097544687484293087601477313978651325255137129839264065 0.4275550934302820849097770405933260917663574218750000000000000000000000000 0.0000000000000000094111898162954679947731789440894191883937120943204313051
|
||||
-0.4275550934302820849097770405933260917663574218750000000000000000000000000 -0.0000000000000000094111898162954679947731789440894191883937120943204313051 0.9039892931234433381959547659789677709341049194335937500000000000000000000 -0.0000000000000000066097544687484293087601477313978651325255137129839264065
|
||||
0.3368898533922200511092626129538984969258308410644531250000000000000000000 -0.0000000000000000004200094003347528494601431516305399984771095349699076074 0.9415440651830208063088889502978418022394180297851562500000000000000000000 -0.0000000000000000278963795476983349094016418681950862291293519244599063978
|
||||
-0.9415440651830208063088889502978418022394180297851562500000000000000000000 0.0000000000000000278963795476983349094016418681950862291293519244599063978 0.3368898533922200511092626129538984969258308410644531250000000000000000000 -0.0000000000000000004200094003347528494601431516305399984771095349699076074
|
||||
0.9700312531945439742386838588572572916746139526367187500000000000000000000 0.0000000000000000183653003484288443908920092041031891826103053310748416949 0.2429801799032638986997056917971349321305751800537109375000000000000000000 -0.0000000000000000087514315297196631565769806659755604546308109536762592784
|
||||
-0.2429801799032638986997056917971349321305751800537109375000000000000000000 0.0000000000000000087514315297196631565769806659755604546308109536762592784 0.9700312531945439742386838588572572916746139526367187500000000000000000000 0.0000000000000000183653003484288443908920092041031891826103053310748416949
|
||||
0.5141027441932217723064013625844381749629974365234375000000000000000000000 -0.0000000000000000457127075236156239511828925399355339402327136821488351437 0.8577286100002721180857179206213913857936859130859375000000000000000000000 -0.0000000000000000481834479363366263073139653840536793747977248347259815464
|
||||
-0.8577286100002721180857179206213913857936859130859375000000000000000000000 0.0000000000000000481834479363366263073139653840536793747977248347259815464 0.5141027441932217723064013625844381749629974365234375000000000000000000000 -0.0000000000000000457127075236156239511828925399355339402327136821488351437
|
||||
0.8032075314806449428672863177780527621507644653320312500000000000000000000 -0.0000000000000000330606098048149096139985697961182567790418192774440364623 0.5956993044924333569056784654094371944665908813476562500000000000000000000 -0.0000000000000000134386419365794626148451101058230490536144850885393736695
|
||||
-0.5956993044924333569056784654094371944665908813476562500000000000000000000 0.0000000000000000134386419365794626148451101058230490536144850885393736695 0.8032075314806449428672863177780527621507644653320312500000000000000000000 -0.0000000000000000330606098048149096139985697961182567790418192774440364623
|
||||
0.1467304744553617479319029826001496985554695129394531250000000000000000000 0.0000000000000000037269471470465677476267642910816717181318621877547658805 0.9891765099647810144389836750633548945188522338867187500000000000000000000 -0.0000000000000000409873099370471113819595571800653989851994072636780924723
|
||||
-0.9891765099647810144389836750633548945188522338867187500000000000000000000 0.0000000000000000409873099370471113819595571800653989851994072636780924723 0.1467304744553617479319029826001496985554695129394531250000000000000000000 0.0000000000000000037269471470465677476267642910816717181318621877547658805
|
||||
sizeof_scalar: 128
|
||||
poly.size(): 64
|
||||
before cuda api: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
|
||||
0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
0.7071067811865475727373109293694142252206802368164062500000000000000000000 -0.0000000000000000483364665672645672552734986488347382506490835271488470948 0.7071067811865475727373109293694142252206802368164062500000000000000000000 -0.0000000000000000483364665672645672552734986488347382506490835271488470948
|
||||
0.9238795325112867384831361050601117312908172607421875000000000000000000000 0.0000000000000000176450470843366770599569725809896737820548554530070833879 0.3826834323650897817792326804919866845011711120605468750000000000000000000 -0.0000000000000000100507726964615876116866635215079604149716565502162279877
|
||||
-0.3826834323650897817792326804919866845011711120605468750000000000000000000 0.0000000000000000100507726964615876116866635215079604149716565502162279877 0.9238795325112867384831361050601117312908172607421875000000000000000000000 0.0000000000000000176450470843366770599569725809896737820548554530070833879
|
||||
0.9807852804032304305792422383092343807220458984375000000000000000000000000 0.0000000000000000185469399978250057259083323855314298012062932205111162598 0.1950903220161282758393639369387528859078884124755859375000000000000000000 -0.0000000000000000079910790684617312634428970369307223484248965070806183797
|
||||
-0.1950903220161282758393639369387528859078884124755859375000000000000000000 0.0000000000000000079910790684617312634428970369307223484248965070806183797 0.9807852804032304305792422383092343807220458984375000000000000000000000000 0.0000000000000000185469399978250057259083323855314298012062932205111162598
|
||||
0.5555702330196021776487214083317667245864868164062500000000000000000000000 0.0000000000000000470941094056167682138404887426404929202505139640422271441 0.8314696123025452356714026791451033204793930053710937500000000000000000000 0.0000000000000000014073856984728023893078524226353211699334975107286878349
|
||||
-0.8314696123025452356714026791451033204793930053710937500000000000000000000 -0.0000000000000000014073856984728023893078524226353211699334975107286878349 0.5555702330196021776487214083317667245864868164062500000000000000000000000 0.0000000000000000470941094056167682138404887426404929202505139640422271441
|
||||
0.9951847266721969287317506314138881862163543701171875000000000000000000000 -0.0000000000000000424869136783044157591677787910035744642531197693552380557 0.0980171403295606036287779261328978464007377624511718750000000000000000000 -0.0000000000000000016345823622442606095622288086432645091240042271949622910
|
||||
-0.0980171403295606036287779261328978464007377624511718750000000000000000000 0.0000000000000000016345823622442606095622288086432645091240042271949622910 0.9951847266721969287317506314138881862163543701171875000000000000000000000 -0.0000000000000000424869136783044157591677787910035744642531197693552380557
|
||||
0.6343932841636454877942696839454583823680877685546875000000000000000000000 0.0000000000000000104209019292800237913543214649613418305006117285573669085 0.7730104533627369933768136434082407504320144653320312500000000000000000000 -0.0000000000000000325659070336497908244691644422883020460214305129287293283
|
||||
-0.7730104533627369933768136434082407504320144653320312500000000000000000000 0.0000000000000000325659070336497908244691644422883020460214305129287293283 0.6343932841636454877942696839454583823680877685546875000000000000000000000 0.0000000000000000104209019292800237913543214649613418305006117285573669085
|
||||
0.8819212643483550495560052695509511977434158325195312500000000000000000000 -0.0000000000000000198432484058905683070896551742611855256204741109827063839 0.4713967368259976420397094898362411186099052429199218750000000000000000000 0.0000000000000000065166781360690129644720854136527256367167864498925970024
|
||||
-0.4713967368259976420397094898362411186099052429199218750000000000000000000 -0.0000000000000000065166781360690129644720854136527256367167864498925970024 0.8819212643483550495560052695509511977434158325195312500000000000000000000 -0.0000000000000000198432484058905683070896551742611855256204741109827063839
|
||||
0.2902846772544623865641710835916455835103988647460937500000000000000000000 -0.0000000000000000189279787077742607055779713593855819062153334180614860927 0.9569403357322088243819280251045711338520050048828125000000000000000000000 0.0000000000000000405538698618756882243805365526526472088078930809326716478
|
||||
-0.9569403357322088243819280251045711338520050048828125000000000000000000000 -0.0000000000000000405538698618756882243805365526526472088078930809326716478 0.2902846772544623865641710835916455835103988647460937500000000000000000000 -0.0000000000000000189279787077742607055779713593855819062153334180614860927
|
||||
0.9987954562051724050064649418345652520656585693359375000000000000000000000 -0.0000000000000000122916933370754648023069228195440464413706083235768885820 0.0490676743274180149345653489945107139647006988525390625000000000000000000 -0.0000000000000000006796103720518276261450983336612892861148698896541419864
|
||||
-0.0490676743274180149345653489945107139647006988525390625000000000000000000 0.0000000000000000006796103720518276261450983336612892861148698896541419864 0.9987954562051724050064649418345652520656585693359375000000000000000000000 -0.0000000000000000122916933370754648023069228195440464413706083235768885820
|
||||
0.6715589548470184411144145997241139411926269531250000000000000000000000000 -0.0000000000000000404890377492966801319705884961725462258731885432927466439 0.7409511253549591058842338497925084084272384643554687500000000000000000000 -0.0000000000000000147086169522973421017060338640487283358362467376636434357
|
||||
-0.7409511253549591058842338497925084084272384643554687500000000000000000000 0.0000000000000000147086169522973421017060338640487283358362467376636434357 0.6715589548470184411144145997241139411926269531250000000000000000000000000 -0.0000000000000000404890377492966801319705884961725462258731885432927466439
|
||||
0.9039892931234433381959547659789677709341049194335937500000000000000000000 -0.0000000000000000066097544687484293087601477313978651325255137129839264065 0.4275550934302820849097770405933260917663574218750000000000000000000000000 0.0000000000000000094111898162954679947731789440894191883937120943204313051
|
||||
-0.4275550934302820849097770405933260917663574218750000000000000000000000000 -0.0000000000000000094111898162954679947731789440894191883937120943204313051 0.9039892931234433381959547659789677709341049194335937500000000000000000000 -0.0000000000000000066097544687484293087601477313978651325255137129839264065
|
||||
0.3368898533922200511092626129538984969258308410644531250000000000000000000 -0.0000000000000000004200094003347528494601431516305399984771095349699076074 0.9415440651830208063088889502978418022394180297851562500000000000000000000 -0.0000000000000000278963795476983349094016418681950862291293519244599063978
|
||||
-0.9415440651830208063088889502978418022394180297851562500000000000000000000 0.0000000000000000278963795476983349094016418681950862291293519244599063978 0.3368898533922200511092626129538984969258308410644531250000000000000000000 -0.0000000000000000004200094003347528494601431516305399984771095349699076074
|
||||
0.9700312531945439742386838588572572916746139526367187500000000000000000000 0.0000000000000000183653003484288443908920092041031891826103053310748416949 0.2429801799032638986997056917971349321305751800537109375000000000000000000 -0.0000000000000000087514315297196631565769806659755604546308109536762592784
|
||||
-0.2429801799032638986997056917971349321305751800537109375000000000000000000 0.0000000000000000087514315297196631565769806659755604546308109536762592784 0.9700312531945439742386838588572572916746139526367187500000000000000000000 0.0000000000000000183653003484288443908920092041031891826103053310748416949
|
||||
0.5141027441932217723064013625844381749629974365234375000000000000000000000 -0.0000000000000000457127075236156239511828925399355339402327136821488351437 0.8577286100002721180857179206213913857936859130859375000000000000000000000 -0.0000000000000000481834479363366263073139653840536793747977248347259815464
|
||||
-0.8577286100002721180857179206213913857936859130859375000000000000000000000 0.0000000000000000481834479363366263073139653840536793747977248347259815464 0.5141027441932217723064013625844381749629974365234375000000000000000000000 -0.0000000000000000457127075236156239511828925399355339402327136821488351437
|
||||
0.8032075314806449428672863177780527621507644653320312500000000000000000000 -0.0000000000000000330606098048149096139985697961182567790418192774440364623 0.5956993044924333569056784654094371944665908813476562500000000000000000000 -0.0000000000000000134386419365794626148451101058230490536144850885393736695
|
||||
-0.5956993044924333569056784654094371944665908813476562500000000000000000000 0.0000000000000000134386419365794626148451101058230490536144850885393736695 0.8032075314806449428672863177780527621507644653320312500000000000000000000 -0.0000000000000000330606098048149096139985697961182567790418192774440364623
|
||||
0.1467304744553617479319029826001496985554695129394531250000000000000000000 0.0000000000000000037269471470465677476267642910816717181318621877547658805 0.9891765099647810144389836750633548945188522338867187500000000000000000000 -0.0000000000000000409873099370471113819595571800653989851994072636780924723
|
||||
-0.9891765099647810144389836750633548945188522338867187500000000000000000000 0.0000000000000000409873099370471113819595571800653989851994072636780924723 0.1467304744553617479319029826001496985554695129394531250000000000000000000 0.0000000000000000037269471470465677476267642910816717181318621877547658805
|
||||
0.0000000000000000000000000000000000000088162076311671563097655240291668426 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000002168404344971008868014905601739883422851562500000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000000000000440810381558357815488276201458342129 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000008673617379884035472059622406959533691406250000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000000000001851403602545102825050760046125036943 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000034694469519536141888238489627838134765625000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000000000007493776486492082863300695424791816196 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000138777878078144567552953958511352539062500000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000000000030063268022280003016300436939458933210 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000555111512312578270211815834045410156250000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000000000120341234165431683628299402998127401267 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000002220446049250313080847263336181640625000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000000000481453098738038406076295267232801273493 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000008881784197001252323389053344726562500000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000000001925900557028465295868278724171496762396 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000035527136788005009293556213378906250000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000000007703690390190172855036212551926278718010 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000142108547152020037174224853515625000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000000030814849722837003091707947862945406540468 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000568434188608080148696899414062500000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000000123259487053424324038394889107021917830297 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000002273736754432320594787597656250000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000000493038036375773607825142654083327962989612 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000009094947017729282379150390625000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000001972152233665170742972133713988552143626874 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000036379788070917129516601562500000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000007888609022822759283560097953609448866175922 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000145519152283668518066406250000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000031554436179453113445911954912093035756372115 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000582076609134674072265625000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000126217744805974530095319382746027383317156888 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000002328306436538696289062500000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000000504870979312060196692949094081764773560295977 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000009313225746154785156250000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000002019483917336402863083467939424714334532852332 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000037252902984619140625000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000008077935669433773528645543320796512578423077754 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000149011611938476562500000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000032311742677823256190893844846283705553983979440 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000596046447753906250000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000129246970711381186839887050948232477456227586186 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000002384185791015625000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000000516987882845612909435859875356027565065202013171 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000009536743164062500000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000002067951531382539799819751172987207915501099721110 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000038146972656250000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000008271806125530247361355316363511929317244690552867 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000152587890625000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000033087224502121077607497577125610814924219053879893 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000610351562500000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000132348898008484398592066620174006357352116507187999 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0002441406250000000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000000529395592033937711917701562924776226282119750976562 -0.0000000000000000000000000000000000000029387358770557187699218413430556142 0.0009765625000000000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000002117582368135750847670806251699104905128479003906250 -0.0000000000000000000000000000000000000029387358770557187699218413430556142 0.0039062500000000000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000008470329472543003390683225006796419620513916015625000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142 0.0156250000000000000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000033881317890172013562732900027185678482055664062500000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142 0.0625000000000000000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000135525271560688054250931600108742713928222656250000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142 0.2500000000000000000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
|
||||
0.0000000000000000000542101086242752217003726400434970855712890625000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142 -0.0000000000000000000000000000000000000029387358770557187699218413430556142 0.0000000000000000000000000000000000000000000000000000000000000000000000000
|
||||
Params::degree 64
|
||||
opt 4
|
||||
BUTTERFLY_DEPTH 1
|
||||
LOG2_DEGREE 5
|
||||
HALF_DEGREE 16
|
||||
STRIDE 16
|
||||
Params::degree 32
|
||||
opt 2
|
||||
-0.3309726757578757561084614735591458156704902648925781250000000000000000000000000000000000000000000000 -0.0000000000000000262161441032774636130079215314589595089813221258498096566391666328854626044631004333 0.0380704474245701987578094360742397839203476905822753906250000000000000000000000000000000000000000000 -0.0000000000000000009616181682828339618607862290825046798652604462695437856356583949946070788428187370
|
||||
-0.1992581793808917978161332484887680038809776306152343750000000000000000000000000000000000000000000000 -0.0000000000000000075339409234980193504176368576269248248756960902950199909788864260917762294411659241 0.0176522535578225135688423819146919413469731807708740234375000000000000000000000000000000000000000000 -0.0000000000000000011079942189849169212186822638096097214445589711721970246904866996828786795958876610
|
||||
0.2260550165243644937795863825158448889851570129394531250000000000000000000000000000000000000000000000 0.0000000000000000126022745915227752500212969900318988585096703503442110150256638689825194887816905975 -0.0799689927978955578113939850481983739882707595825195312500000000000000000000000000000000000000000000 -0.0000000000000000063460023733636283936840017401873121454740939053999759789093104700441472232341766357
|
||||
0.2426504064692105999689886175474384799599647521972656250000000000000000000000000000000000000000000000 0.0000000000000000004677977136086304752259424595325139275587325315104779248354116560904003563337028027 0.0365333979932580613092518717621715040877461433410644531250000000000000000000000000000000000000000000 0.0000000000000000005709344770022019545359771749790997852063776745568752721271543748571275500580668449
|
||||
0.0914749448999244568758726359192223753780126571655273437500000000000000000000000000000000000000000000 -0.0000000000000000036609340702821524448717294490586665457459787183420747008755213869335420895367860794 0.2786308216747127586288002021319698542356491088867187500000000000000000000000000000000000000000000000 0.0000000000000000041379918916404281616895809751333352103206111649103329797050321303686359897255897522
|
||||
-0.0124926024286186147793653589133100467734038829803466796875000000000000000000000000000000000000000000 -0.0000000000000000006937190272456474979519993342187765906642550802765293666007906381310021970421075821 0.2110491012472701377600259320388431660830974578857421875000000000000000000000000000000000000000000000 -0.0000000000000000051785532317672362668742834231741586121061746167014080977075707323820097371935844421
|
||||
-0.0494172631142162574469089975082169985398650169372558593750000000000000000000000000000000000000000000 0.0000000000000000020700342936472428610421033900271927805735103256831817347682900276595319155603647232 -0.2029124738196469157625045909298933111131191253662109375000000000000000000000000000000000000000000000 0.0000000000000000032892946516813893505437931152017216374523186022500384484423818776122061535716056824
|
||||
0.0315092344667192991725102046984829939901828765869140625000000000000000000000000000000000000000000000 0.0000000000000000026511341643412592805126523014407753588898695525533522748817816250266332644969224930 -0.2988676928669205534738750884571345522999763488769531250000000000000000000000000000000000000000000000 -0.0000000000000000223519185626510679759809968042136173580417081359914335814131902679946506395936012268
|
||||
-0.1672368467298269156806611590582178905606269836425781250000000000000000000000000000000000000000000000 -0.0000000000000000048162869081133674839304134386709639116236975270920334102342508231231477111577987671 0.2727747087939790260335826133086811751127243041992187500000000000000000000000000000000000000000000000 -0.0000000000000000085823320193215261882172073504002543651078135111842128124326478655348182655870914459
|
||||
-0.1420260232345169348899815986442263238131999969482421875000000000000000000000000000000000000000000000 0.0000000000000000078353130752317586555795127350300028590782462991921192285715846992388833314180374146 0.1452463415301429028314572633462375961244106292724609375000000000000000000000000000000000000000000000 -0.0000000000000000130538715628498043839887390457914535898550603454414467674560285104234935715794563293
|
||||
0.1033287219076595153666175974649377167224884033203125000000000000000000000000000000000000000000000000 0.0000000000000000065787245791957383370689888773812031301831047645908498444633494273148244246840476990 -0.1956877991027157481784826131843146868050098419189453125000000000000000000000000000000000000000000000 -0.0000000000000000108434329174532879698514604527317036512444788844025493623757228078829939477145671844
|
||||
0.2183523172540651480222351210613851435482501983642578125000000000000000000000000000000000000000000000 -0.0000000000000000042952334406807644211423904079848498142548976906260858876773767178747220896184444427 -0.1611283900775630495427748201109352521598339080810546875000000000000000000000000000000000000000000000 0.0000000000000000049035193854080055673983422185833393998012569040749301929604797578576835803687572479
|
||||
0.2397036106352415651965515053234412334859371185302734375000000000000000000000000000000000000000000000 -0.0000000000000000052953977110488860650721110650147839714434454694985676681540098797995597124099731445 0.1115723403910117567106041747138078790158033370971679687500000000000000000000000000000000000000000000 -0.0000000000000000050113374213532207659401750409473001366666780484688051289232646468008169904351234436
|
||||
0.1398900907499183998705660769701353274285793304443359375000000000000000000000000000000000000000000000 0.0000000000000000025854512863426880011882823985481452679392879801315019427931929385522380471229553223 0.1766487032839706705633631145246908999979496002197265625000000000000000000000000000000000000000000000 0.0000000000000000047294440057263980645363900766920231270194547596580608023186798050119250547140836716
|
||||
-0.1644190411459663658799712493419065140187740325927734375000000000000000000000000000000000000000000000 -0.0000000000000000061410496854399099844472379531271533022912420237207631323883560980902984738349914551 -0.1171702953286619941497548325060051865875720977783203125000000000000000000000000000000000000000000000 0.0000000000000000018800082941866710238088777334985347003193960609764186220349557743247714824974536896
|
||||
-0.2271417216519029391719186605769209563732147216796875000000000000000000000000000000000000000000000000 -0.0000000000000000003410581758850931755798386799378379375470085294421492688732300280207709874957799911 -0.2324424613666220273966445120095158927142620086669921875000000000000000000000000000000000000000000000 0.0000000000000000052705161220801475140341690423664438031522828137140877780009162734131678007543087006
|
||||
-0.2768098499713873561489663188694976270198822021484375000000000000000000000000000000000000000000000000 0.0000000000000000271839307303537356402336171088715461786860875246282093220528963684046175330877304077 0.1777761056245066584313718749399413354694843292236328125000000000000000000000000000000000000000000000 0.0000000000000000031436349524061540422500698511335631041026594357601451484862664642605523113161325455
|
||||
-0.1816506822808992283047047067157109268009662628173828125000000000000000000000000000000000000000000000 -0.0000000000000000091744046880454127720279131854420783978544864776692377428268798666977090761065483093 0.0859671095812945679215388850025192368775606155395507812500000000000000000000000000000000000000000000 -0.0000000000000000019790221122024815278620829931698194201352001468739022493892321108432952314615249634
|
||||
0.1733917003068170814561455017610569484531879425048828125000000000000000000000000000000000000000000000 -0.0000000000000000002355669226462113621703291114551109740391745850677726603628059365291846916079521179 -0.1507412427231122065141022403622628189623355865478515625000000000000000000000000000000000000000000000 -0.0000000000000000050074860604269868780685029482838904707624616828879823837716855905455304309725761414
|
||||
0.2504400589351173378460657659161370247602462768554687500000000000000000000000000000000000000000000000 -0.0000000000000000105495846440776700797075650436594384534827739945562347090302068863820750266313552856 -0.0608093817831304034315920148401346523314714431762695312500000000000000000000000000000000000000000000 -0.0000000000000000006649734888645786681087537662594592372951811360499638214882622833101777359843254089
|
||||
0.1862075376662823411599845258024288341403007507324218750000000000000000000000000000000000000000000000 0.0000000000000000092738192415434320391903987162228447654735493063692976234424492076868773438036441803 0.2071792938507401871106594626326113939285278320312500000000000000000000000000000000000000000000000000 0.0000000000000000037279901254371484709180590427832381461048739621779671976620207374253368470817804337
|
||||
0.0649597768658507629391252180539595428854227066040039062500000000000000000000000000000000000000000000 -0.0000000000000000000599891608510439561166750358405470603488327470983847156116297583139385096728801727 0.2077132880219826616130518459613085724413394927978515625000000000000000000000000000000000000000000000 0.0000000000000000109365476335817531496040397320749442280186865305083299126920337585033848881721496582
|
||||
-0.1149666930854301022391084075024991761893033981323242187500000000000000000000000000000000000000000000 -0.0000000000000000046719677244762014686309320217246739870689257903384195014240276577766053378582000732 -0.1693829849301254908411351607355754822492599487304687500000000000000000000000000000000000000000000000 0.0000000000000000019290098218370279959780502759588964735884392347577739185160439205901639070361852646
|
||||
-0.1013849860233902999695487778808455914258956909179687500000000000000000000000000000000000000000000000 -0.0000000000000000010958363504292690122307333050960276846345426518287025779985199847033072728663682938 -0.2972510798575715429414856316725490614771842956542968750000000000000000000000000000000000000000000000 0.0000000000000000151060681535435571571128080956451898860237590587266010011546768510015681385993957520
|
||||
-0.0338490332315699790632912424825917696580290794372558593750000000000000000000000000000000000000000000 -0.0000000000000000032112051133747497270867287155696078651722246333124338724118729260226245969533920288 0.3056514274567712630137350515724392607808113098144531250000000000000000000000000000000000000000000000 -0.0000000000000000084660885661200502048130129231798305394126263491287077977975883413819246925413608551
|
||||
-0.0838544664005262951889108080649748444557189941406250000000000000000000000000000000000000000000000000 -0.0000000000000000051800512586075403991650040606243722019399192612806870350539156788727268576622009277 0.1888297106619587217846145676958258263766765594482421875000000000000000000000000000000000000000000000 0.0000000000000000125710005198254852176108184946779873729544425121429709890641390757082263007760047913
|
||||
0.0259785877288864451573058289568507461808621883392333984375000000000000000000000000000000000000000000 0.0000000000000000003808995729306562665530126889433911280323679558962660096632646400394150987267494202 -0.2127655593344512374720522984716808423399925231933593750000000000000000000000000000000000000000000000 -0.0000000000000000019935625096555455247881784143722497167520067235186101428645599753508577123284339905
|
||||
0.1435981506670927776436741396537399850785732269287109375000000000000000000000000000000000000000000000 0.0000000000000000131126020340999405450862544727246234289488926316605624244893135710299247875809669495 -0.2472170630286313064427616836837842129170894622802734375000000000000000000000000000000000000000000000 -0.0000000000000000122808089510788868829075694614557771638870714467047309814695665863837348297238349915
|
||||
0.2511343731265572865041235672833863645792007446289062500000000000000000000000000000000000000000000000 0.0000000000000000092452280298070285367329115251744832217129003532020067868124613141844747588038444519 0.0108656958033641493771570551984950725454837083816528320312500000000000000000000000000000000000000000 -0.0000000000000000002947488139454235324126363051764372718819140216213133855183148313017227337695658207
|
||||
0.2025455324622073560725254992576083168387413024902343750000000000000000000000000000000000000000000000 0.0000000000000000092268658403218626649354055433850145426009984179994624642340284026431618258357048035 0.1183192308221643201937567368986492510885000228881835937500000000000000000000000000000000000000000000 -0.0000000000000000033112219949499888538350744600518247336266158650688439663012374580830510240048170090
|
||||
-0.1933471753078349342214181660892791114747524261474609375000000000000000000000000000000000000000000000 0.0000000000000000056708946471137426758117517411030692033211668293211232816375400034303311258554458618 -0.0524965901614871180425048180495650740340352058410644531250000000000000000000000000000000000000000000 0.0000000000000000026002120264691327416023942568955159965262954086960469485267921641025168355554342270
|
||||
-0.3123928209210611317558914379333145916461944580078125000000000000000000000000000000000000000000000000 0.0000000000000000259207635313622839687841546274592684607857754166323344113820326128916349261999130249 -0.1116379705409853706488121360962395556271076202392578125000000000000000000000000000000000000000000000 0.0000000000000000066179487727938613466908117740857190639118025083514895712255565740633755922317504883
|
||||
32768
|
||||
340282366920938463463374607431768170496
|
||||
340282366920938463463374607431768080384
|
||||
340282366920938463463374607431768023040
|
||||
340282366920938463463374607431768092672
|
||||
340282366920938463463374607431768047616
|
||||
340282366920938463463374607431767556096
|
||||
71168
|
||||
277888
|
||||
1262592
|
||||
4018176
|
||||
16760832
|
||||
67108864
|
||||
268304384
|
||||
1074266112
|
||||
4295099124
|
||||
17180026739
|
||||
68719574034
|
||||
274877809558
|
||||
1099511650955
|
||||
4398046581374
|
||||
17592185995320
|
||||
70368743494672
|
||||
281474976801920
|
||||
1125899906931200
|
||||
4503599627216896
|
||||
18014398509543424
|
||||
72057594037780480
|
||||
288230376151842816
|
||||
1152921504606584832
|
||||
4611686018426339328
|
||||
18446744073709455341
|
||||
73786976294838222848
|
||||
295147905179352834048
|
||||
1180591620717411139584
|
||||
4722366482869645363200
|
||||
18889465931478580777984
|
||||
75557863725914323349504
|
||||
302231454903657293758464
|
||||
1208925819614629174814720
|
||||
4835703278458516698808960
|
||||
19342813113834066795323392
|
||||
77371252455336267180941312
|
||||
309485009821345068724764672
|
||||
1237940039285380274898993152
|
||||
4951760157141521099596627968
|
||||
19807040628566084398385463296
|
||||
79228162514264337593544017586
|
||||
316912650057057350374175694614
|
||||
1267650600228229401496703230278
|
||||
5070602400912917605986813129648
|
||||
20282409603651670423947251205115
|
||||
81129638414606681695789005047098
|
||||
324518553658426726783156020669176
|
||||
1298074214633706907132624082931600
|
||||
5192296858534827628530496329081216
|
||||
20769187434139310514121985316977152
|
||||
83076749736557242056487941267441664
|
||||
332306998946228968225951765070360576
|
||||
1329227995784915872903807060280262656
|
||||
5316911983139663491615228241121247232
|
||||
21267647932558653966460912964485251072
|
||||
85070591730234615865843651857941004288
|
||||
58358
|
||||
test core_crypto::gpu::algorithms::test::fft::test_roundtrip_u128 ... ok
|
||||
|
||||
test result: ok. 1 passed; 0 failed; 0 ignored; 0 measured; 354 filtered out; finished in 0.19s
|
||||
|
||||
|
||||
running 0 tests
|
||||
|
||||
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
|
||||
|
||||
|
||||
running 0 tests
|
||||
|
||||
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
|
||||
|
||||
|
||||
running 0 tests
|
||||
|
||||
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 200 filtered out; finished in 0.01s
|
||||
|
||||
0
gpu.logeset
Normal file
0
gpu.logeset
Normal file
68
parse_f128_twiddles.py
Normal file
68
parse_f128_twiddles.py
Normal file
@@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
|
||||
def parse_file(filename):
|
||||
with open(filename, 'r') as f:
|
||||
# Read all non-empty lines (you can also filter out comments if needed)
|
||||
lines = [line.strip() for line in f if line.strip()]
|
||||
|
||||
# The first line is expected to be the count (e.g. "4096")
|
||||
try:
|
||||
count = int(lines[0])
|
||||
except ValueError:
|
||||
sys.exit("Error: The first line must be an integer (the count of data lines).")
|
||||
|
||||
# Initialize lists for each twiddle array.
|
||||
neg_twiddles_re_hi = []
|
||||
neg_twiddles_re_lo = []
|
||||
neg_twiddles_im_hi = []
|
||||
neg_twiddles_im_lo = []
|
||||
|
||||
# Process each subsequent line.
|
||||
for i, line in enumerate(lines[1:], start=1):
|
||||
tokens = line.split()
|
||||
if len(tokens) != 4:
|
||||
sys.exit(f"Error on line {i+1}: expected 4 tokens, found {len(tokens)}.")
|
||||
neg_twiddles_re_hi.append(tokens[0])
|
||||
neg_twiddles_re_lo.append(tokens[1])
|
||||
neg_twiddles_im_hi.append(tokens[2])
|
||||
neg_twiddles_im_lo.append(tokens[3])
|
||||
|
||||
if len(neg_twiddles_re_hi) != count:
|
||||
print(f"Warning: Count mismatch. Expected {count} entries but found {len(neg_twiddles_re_hi)}.")
|
||||
count = len(neg_twiddles_re_hi) # adjust count to the actual number of data lines
|
||||
|
||||
return count, neg_twiddles_re_hi, neg_twiddles_re_lo, neg_twiddles_im_hi, neg_twiddles_im_lo
|
||||
|
||||
def print_cpp_array(name, count, values, indent=4, per_line=4):
|
||||
indent_str = " " * indent
|
||||
print(f"__device__ double {name}[{count}] = {{")
|
||||
for i, val in enumerate(values):
|
||||
# Print a newline every 'per_line' entries.
|
||||
if i % per_line == 0:
|
||||
print(indent_str, end="")
|
||||
print(val, end="")
|
||||
if i != len(values) - 1:
|
||||
print(", ", end="")
|
||||
if (i + 1) % per_line == 0:
|
||||
print("")
|
||||
# If the last line wasn't completed, print a newline.
|
||||
if len(values) % per_line != 0:
|
||||
print("")
|
||||
print("};\n")
|
||||
|
||||
def main():
|
||||
if len(sys.argv) != 2:
|
||||
sys.exit("Usage: python3 generate_twiddles.py <input_file>")
|
||||
|
||||
filename = sys.argv[1]
|
||||
count, re_hi, re_lo, im_hi, im_lo = parse_file(filename)
|
||||
|
||||
# Generate C++ arrays.
|
||||
print_cpp_array("neg_twiddles_re_hi", count, re_hi)
|
||||
print_cpp_array("neg_twiddles_re_lo", count, re_lo)
|
||||
print_cpp_array("neg_twiddles_im_hi", count, im_hi)
|
||||
print_cpp_array("neg_twiddles_im_lo", count, im_lo)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
4099
parsed_log.txt
Normal file
4099
parsed_log.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -35,8 +35,14 @@ fn two_diff(a: f64, b: f64) -> (f64, f64) {
|
||||
|
||||
#[inline(always)]
|
||||
fn two_prod(a: f64, b: f64) -> (f64, f64) {
|
||||
// println!("two_prod");
|
||||
// println!("a: {:?}", a);
|
||||
// println!("b: {:?}", b);
|
||||
let p = a * b;
|
||||
(p, f64::mul_add(a, b, -p))
|
||||
// println!("p: {:?}", p);
|
||||
let p2 = f64::mul_add(a, b, -p);
|
||||
// println!("p2: {:?}", p2);
|
||||
(p, p2)
|
||||
}
|
||||
|
||||
use core::{
|
||||
@@ -393,9 +399,21 @@ impl f128 {
|
||||
/// Multiplies `a` and `b` and returns the result.
|
||||
#[inline(always)]
|
||||
pub fn mul_f128_f128(a: f128, b: f128) -> Self {
|
||||
// println!("mul_f128_f128");
|
||||
// println!("a: {:?}", a);
|
||||
// println!("b: {:?}", b);
|
||||
let (p1, p2) = two_prod(a.0, b.0);
|
||||
// println!("p1: {:?}", p1);
|
||||
// println!("p2: {:?}", p2);
|
||||
let p2 = p2 + (a.0 * b.1 + a.1 * b.0);
|
||||
// println!("p2: {:?}", p2);
|
||||
|
||||
// println!("a.0 * b.1: {:?}", a.0 * b.1);
|
||||
// println!("a.1 * b.0: {:?}", a.1 * b.0);
|
||||
|
||||
let (p1, p2) = quick_two_sum(p1, p2);
|
||||
// println!("p1: {:?}", p1);
|
||||
// println!("p2: {:?}", p2);
|
||||
Self(p1, p2)
|
||||
}
|
||||
|
||||
|
||||
@@ -183,6 +183,7 @@ impl FftSimdF128 for Scalar {
|
||||
|
||||
#[inline(always)]
|
||||
fn mul(self, a: (Self::Reg, Self::Reg), b: (Self::Reg, Self::Reg)) -> (Self::Reg, Self::Reg) {
|
||||
println!("mul for scalar");
|
||||
let f128(o0, o1) = f128(a.0, a.1) * f128(b.0, b.1);
|
||||
(o0, o1)
|
||||
}
|
||||
@@ -316,15 +317,60 @@ trait FftSimdF128Ext: FftSimdF128 {
|
||||
b_re: (Self::Reg, Self::Reg),
|
||||
b_im: (Self::Reg, Self::Reg),
|
||||
) -> ((Self::Reg, Self::Reg), (Self::Reg, Self::Reg)) {
|
||||
println!("fftSimdF123::cplx_mul");
|
||||
let a_re_x_b_re = self.mul(a_re, b_re);
|
||||
let a_re_x_b_im = self.mul(a_re, b_im);
|
||||
let a_im_x_b_re = self.mul(a_im, b_re);
|
||||
let a_im_x_b_im = self.mul(a_im, b_im);
|
||||
|
||||
(
|
||||
let a_re_0: f64 = unsafe { std::mem::transmute_copy(&a_re.0) };
|
||||
let a_re_1: f64 = unsafe { std::mem::transmute_copy(&a_re.1) };
|
||||
|
||||
let a_im_0: f64 = unsafe { std::mem::transmute_copy(&a_im.0) };
|
||||
let a_im_1: f64 = unsafe { std::mem::transmute_copy(&a_im.1) };
|
||||
|
||||
let b_re_0: f64 = unsafe { std::mem::transmute_copy(&b_re.0) };
|
||||
let b_re_1: f64 = unsafe { std::mem::transmute_copy(&b_re.1) };
|
||||
|
||||
let b_im_0: f64 = unsafe { std::mem::transmute_copy(&b_im.0) };
|
||||
let b_im_1: f64 = unsafe { std::mem::transmute_copy(&b_im.1) };
|
||||
|
||||
let a_re_x_b_re_0: f64 = unsafe { std::mem::transmute_copy(&a_re_x_b_re.0) };
|
||||
let a_re_x_b_re_1: f64 = unsafe { std::mem::transmute_copy(&a_re_x_b_re.1) };
|
||||
|
||||
let a_re_x_b_im_0: f64 = unsafe { std::mem::transmute_copy(&a_re_x_b_im.0) };
|
||||
let a_re_x_b_im_1: f64 = unsafe { std::mem::transmute_copy(&a_re_x_b_im.1) };
|
||||
|
||||
let a_im_x_b_re_0: f64 = unsafe { std::mem::transmute_copy(&a_im_x_b_re.0) };
|
||||
let a_im_x_b_re_1: f64 = unsafe { std::mem::transmute_copy(&a_im_x_b_re.1) };
|
||||
|
||||
let a_im_x_b_im_0: f64 = unsafe { std::mem::transmute_copy(&a_im_x_b_im.0) };
|
||||
let a_im_x_b_im_1: f64 = unsafe { std::mem::transmute_copy(&a_im_x_b_im.1) };
|
||||
|
||||
println!("a_re: {:.100} {:.100}", a_re_0, a_re_1);
|
||||
println!("a_im: {:.100} {:.100}", a_im_0, a_im_1);
|
||||
println!("b_re: {:.100} {:.100}", b_re_0, b_re_1);
|
||||
println!("b_im: {:.100} {:.100}", b_im_0, b_im_1);
|
||||
|
||||
println!("a_re_x_b_re: {:.100} {:.100}", a_re_x_b_re_0, a_re_x_b_re_1);
|
||||
println!("a_re_x_b_im: {:.100} {:.100}", a_re_x_b_im_0, a_re_x_b_im_1);
|
||||
println!("a_im_x_b_re: {:.100} {:.100}", a_im_x_b_re_0, a_im_x_b_re_1);
|
||||
println!("a_im_x_b_im: {:.100} {:.100}", a_im_x_b_im_0, a_im_x_b_im_1);
|
||||
|
||||
let (c_re, c_im) = (
|
||||
self.sub(a_re_x_b_re, a_im_x_b_im),
|
||||
self.add(a_im_x_b_re, a_re_x_b_im),
|
||||
)
|
||||
);
|
||||
|
||||
let c_re_0: f64 = unsafe { std::mem::transmute_copy(&c_re.0) };
|
||||
let c_re_1: f64 = unsafe { std::mem::transmute_copy(&c_re.1) };
|
||||
|
||||
let c_im_0: f64 = unsafe { std::mem::transmute_copy(&c_im.0) };
|
||||
let c_im_1: f64 = unsafe { std::mem::transmute_copy(&c_im.1) };
|
||||
println!("a_re: {:.100} {:.100}", c_re_0, c_re_1);
|
||||
println!("a_im: {:.100} {:.100}", c_im_0, c_im_1);
|
||||
|
||||
(c_re, c_im)
|
||||
}
|
||||
|
||||
/// `a * conj(b)`
|
||||
@@ -366,7 +412,9 @@ pub fn negacyclic_fwd_fft_scalar(
|
||||
let mut m = 1;
|
||||
let simd = Scalar;
|
||||
|
||||
let mut ii = 0;
|
||||
while m < n {
|
||||
ii = ii + 1;
|
||||
t /= 2;
|
||||
|
||||
for i in 0..m {
|
||||
@@ -390,6 +438,7 @@ pub fn negacyclic_fwd_fft_scalar(
|
||||
{
|
||||
let (z0_re, z0_im) = ((*z0_re0, *z0_re1), (*z0_im0, *z0_im1));
|
||||
let (z1_re, z1_im) = ((*z1_re0, *z1_re1), (*z1_im0, *z1_im1));
|
||||
println!("{:?} {:?}", w1_re, w1_im);
|
||||
let (z1w_re, z1w_im) = simd.cplx_mul(z1_re, z1_im, w1_re, w1_im);
|
||||
|
||||
((*z0_re0, *z0_re1), (*z0_im0, *z0_im1)) =
|
||||
@@ -398,6 +447,7 @@ pub fn negacyclic_fwd_fft_scalar(
|
||||
simd.cplx_sub(z0_re, z0_im, z1w_re, z1w_im);
|
||||
}
|
||||
}
|
||||
//break;
|
||||
|
||||
m *= 2;
|
||||
}
|
||||
@@ -1050,22 +1100,22 @@ pub fn negacyclic_fwd_fft(
|
||||
twid_im0: &[f64],
|
||||
twid_im1: &[f64],
|
||||
) {
|
||||
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
{
|
||||
#[cfg(feature = "nightly")]
|
||||
if let Some(simd) = V4::try_new() {
|
||||
return negacyclic_fwd_fft_avx512(
|
||||
simd, data_re0, data_re1, data_im0, data_im1, twid_re0, twid_re1, twid_im0,
|
||||
twid_im1,
|
||||
);
|
||||
}
|
||||
if let Some(simd) = V3::try_new() {
|
||||
return negacyclic_fwd_fft_avxfma(
|
||||
simd, data_re0, data_re1, data_im0, data_im1, twid_re0, twid_re1, twid_im0,
|
||||
twid_im1,
|
||||
);
|
||||
}
|
||||
}
|
||||
// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
|
||||
// {
|
||||
// #[cfg(feature = "nightly")]
|
||||
// if let Some(simd) = V4::try_new() {
|
||||
// return negacyclic_fwd_fft_avx512(
|
||||
// simd, data_re0, data_re1, data_im0, data_im1, twid_re0, twid_re1, twid_im0,
|
||||
// twid_im1,
|
||||
// );
|
||||
// }
|
||||
// if let Some(simd) = V3::try_new() {
|
||||
// return negacyclic_fwd_fft_avxfma(
|
||||
// simd, data_re0, data_re1, data_im0, data_im1, twid_re0, twid_re1, twid_im0,
|
||||
// twid_im1,
|
||||
// );
|
||||
// }
|
||||
// }
|
||||
negacyclic_fwd_fft_scalar(
|
||||
data_re0, data_re1, data_im0, data_im1, twid_re0, twid_re1, twid_im0, twid_im1,
|
||||
)
|
||||
@@ -1803,6 +1853,47 @@ fn bitreverse(i: usize, n: usize) -> usize {
|
||||
result
|
||||
}
|
||||
|
||||
fn f64_to_hexfloat(f: f64) -> String {
|
||||
// Special cases.
|
||||
if f.is_nan() {
|
||||
return "NaN".to_string();
|
||||
}
|
||||
if f.is_infinite() {
|
||||
return if f.is_sign_positive() {
|
||||
"inf".to_string()
|
||||
} else {
|
||||
"-inf".to_string()
|
||||
};
|
||||
}
|
||||
if f == 0.0 {
|
||||
// Handle both +0.0 and -0.0.
|
||||
return if f.is_sign_negative() {
|
||||
"-0x0p+0".to_string()
|
||||
} else {
|
||||
"0x0p+0".to_string()
|
||||
};
|
||||
}
|
||||
|
||||
let bits = f.to_bits();
|
||||
let sign = if bits >> 63 == 1 { "-" } else { "" };
|
||||
let exponent_bits = ((bits >> 52) & 0x7FF) as i32;
|
||||
let fraction = bits & ((1u64 << 52) - 1);
|
||||
|
||||
// For normalized numbers the printed form is "0x1.<fraction>p<exp>".
|
||||
// There are 52 fraction bits so printing 13 hex digits (since 13 * 4 = 52)
|
||||
// captures the entire fractional part.
|
||||
if exponent_bits != 0 {
|
||||
let frac_str = format!("{:013x}", fraction);
|
||||
let exp = exponent_bits - 1023;
|
||||
return format!("{}0x1.{}p{:+}", sign, frac_str, exp);
|
||||
} else {
|
||||
// Subnormal numbers: no implicit leading 1.
|
||||
let frac_str = format!("{:013x}", fraction);
|
||||
// For subnormals the effective exponent is fixed at -1022.
|
||||
return format!("{}0x0.{}p-1022", sign, frac_str);
|
||||
}
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn init_negacyclic_twiddles(
|
||||
twid_re0: &mut [f64],
|
||||
@@ -1824,6 +1915,10 @@ pub fn init_negacyclic_twiddles(
|
||||
twid_re1[pos] = c.1;
|
||||
twid_im0[pos] = s.0;
|
||||
twid_im1[pos] = s.1;
|
||||
|
||||
println!("{:.73} {:.73} {:.73} {:.73}", c.0, c.1, s.0, s.1);
|
||||
// println!("{} {} {} {}", f64_to_hexfloat(c.0), f64_to_hexfloat(c.1),
|
||||
// f64_to_hexfloat(s.0), f64_to_hexfloat(s.1));
|
||||
}
|
||||
m *= 2;
|
||||
}
|
||||
|
||||
@@ -140,6 +140,7 @@ pub fn u128_to_f64(x: u128) -> f64 {
|
||||
const B: f64 = (1u128 << 104) as f64;
|
||||
const C: f64 = (1u128 << 76) as f64;
|
||||
const D: f64 = u128::MAX as f64;
|
||||
|
||||
if x < 1 << 104 {
|
||||
let l = f64::from_bits(A.to_bits() | ((x << 12) as u64 >> 12)) - A;
|
||||
let h = f64::from_bits(B.to_bits() | (x >> 52) as u64) - B;
|
||||
|
||||
@@ -5,7 +5,7 @@ use dyn_stack::{GlobalPodBuffer, ReborrowMut};
|
||||
|
||||
fn test_roundtrip<Scalar: UnsignedTorus>() {
|
||||
let mut generator = new_random_generator();
|
||||
for size_log in 6..=14 {
|
||||
for size_log in 6..=6 {
|
||||
let size = 1_usize << size_log;
|
||||
let fourier_size = PolynomialSize(size).to_fourier_polynomial_size().0;
|
||||
|
||||
@@ -19,13 +19,42 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
|
||||
let mut fourier_im0 = avec![0.0f64; fourier_size].into_boxed_slice();
|
||||
let mut fourier_im1 = avec![0.0f64; fourier_size].into_boxed_slice();
|
||||
|
||||
println!("sizeof_scalar: {:?}", Scalar::BITS);
|
||||
println!("poly.size(): {:?}", poly.len());
|
||||
|
||||
let mut base = Scalar::TWO * Scalar::TWO;
|
||||
let mut exp = base;
|
||||
for x in poly.as_mut().iter_mut() {
|
||||
*x = generator.random_uniform();
|
||||
*x = base - Scalar::ONE;
|
||||
base = base * exp;
|
||||
}
|
||||
|
||||
let n = poly.len();
|
||||
// for coef in poly.iter() {
|
||||
// println!("{:?}", coef);
|
||||
// }
|
||||
|
||||
let mut mem = GlobalPodBuffer::new(fft.backward_scratch().unwrap());
|
||||
let mut stack = PodStack::new(&mut mem);
|
||||
|
||||
let (standard_re, standard_im) = poly.split_at(n / 2);
|
||||
|
||||
convert_forward_torus(
|
||||
&mut fourier_re0,
|
||||
&mut fourier_re1,
|
||||
&mut fourier_im0,
|
||||
&mut fourier_im1,
|
||||
&standard_re,
|
||||
&standard_im,
|
||||
);
|
||||
for ((re0, re1), (im0, im1)) in (&*fourier_re0)
|
||||
.iter()
|
||||
.zip((&*fourier_re1).iter())
|
||||
.zip((&*fourier_im0).iter().zip((&*fourier_im1).iter()))
|
||||
{
|
||||
println!("{:.73} {:.73} {:.73} {:.73}", re0, re1, im0, im1);
|
||||
}
|
||||
|
||||
fft.forward_as_torus(
|
||||
&mut fourier_re0,
|
||||
&mut fourier_re1,
|
||||
@@ -33,6 +62,15 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
|
||||
&mut fourier_im1,
|
||||
&poly,
|
||||
);
|
||||
|
||||
for ((re0, re1), (im0, im1)) in (&*fourier_re0)
|
||||
.iter()
|
||||
.zip((&*fourier_re1).iter())
|
||||
.zip((&*fourier_im0).iter().zip((&*fourier_im1).iter()))
|
||||
{
|
||||
println!("{:.100} {:.100} {:.100} {:.100}", re0, re1, im0, im1);
|
||||
}
|
||||
|
||||
fft.backward_as_torus(
|
||||
&mut roundtrip,
|
||||
&fourier_re0,
|
||||
@@ -42,6 +80,10 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
|
||||
stack.rb_mut(),
|
||||
);
|
||||
|
||||
for coefficient in roundtrip.iter() {
|
||||
println!("{:?}", coefficient);
|
||||
}
|
||||
|
||||
for (expected, actual) in izip!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
|
||||
if Scalar::BITS <= 64 {
|
||||
assert_eq!(*expected, *actual);
|
||||
|
||||
274
tfhe/src/core_crypto/gpu/algorithms/test/fft.rs
Normal file
274
tfhe/src/core_crypto/gpu/algorithms/test/fft.rs
Normal file
@@ -0,0 +1,274 @@
|
||||
use super::*;
|
||||
use crate::core_crypto::commons::test_tools::{modular_distance, new_random_generator};
|
||||
use crate::core_crypto::commons::utils::izip;
|
||||
use crate::core_crypto::gpu::{
|
||||
fourier_transform_forward_as_integer_f128_async, fourier_transform_forward_as_torus_f128_async,
|
||||
CudaStreams,
|
||||
};
|
||||
use aligned_vec::avec;
|
||||
use dyn_stack::{GlobalPodBuffer, PodStack, ReborrowMut};
|
||||
use tfhe_cuda_backend::cuda_bind::cuda_synchronize_device;
|
||||
|
||||
fn test_roundtrip<Scalar: UnsignedTorus>() {
|
||||
let mut generator = new_random_generator();
|
||||
for size_log in 6..=6 {
|
||||
let size = 1_usize << size_log;
|
||||
let fourier_size = PolynomialSize(size).to_fourier_polynomial_size().0;
|
||||
|
||||
let fft = Fft128::new(PolynomialSize(size));
|
||||
let fft = fft.as_view();
|
||||
|
||||
let mut poly = avec![Scalar::ZERO; size].into_boxed_slice();
|
||||
let mut roundtrip = avec![Scalar::ZERO; size].into_boxed_slice();
|
||||
let mut fourier_re0 = avec![0.0f64; fourier_size].into_boxed_slice();
|
||||
let mut fourier_re1 = avec![0.0f64; fourier_size].into_boxed_slice();
|
||||
let mut fourier_im0 = avec![0.0f64; fourier_size].into_boxed_slice();
|
||||
let mut fourier_im1 = avec![0.0f64; fourier_size].into_boxed_slice();
|
||||
if poly.len() > 64 {
|
||||
return;
|
||||
}
|
||||
println!("sizeof_scalar: {:?}", Scalar::BITS);
|
||||
println!("poly.size(): {:?}", poly.len());
|
||||
let mut base = Scalar::TWO * Scalar::TWO;
|
||||
let mut exp = base;
|
||||
for x in poly.as_mut().iter_mut() {
|
||||
*x = base - Scalar::ONE;
|
||||
base = base * exp;
|
||||
}
|
||||
|
||||
let mut mem = GlobalPodBuffer::new(fft.backward_scratch().unwrap());
|
||||
let mut stack = PodStack::new(&mut mem);
|
||||
|
||||
// fft.forward_as_torus(
|
||||
// &mut fourier_re0,
|
||||
// &mut fourier_re1,
|
||||
// &mut fourier_im0,
|
||||
// &mut fourier_im1,
|
||||
// &poly,
|
||||
// );
|
||||
let gpu_index = 0;
|
||||
let stream = CudaStreams::new_single_gpu(gpu_index);
|
||||
|
||||
unsafe {
|
||||
// println!("size: {:?}", size);
|
||||
// println!("poly.len: {:?}", poly.len());
|
||||
// println!("rust poly");
|
||||
// for coefficient in poly.iter() {
|
||||
// println!(
|
||||
// "{:0width$b}",
|
||||
// coefficient,
|
||||
// width = std::mem::size_of::<Scalar>() * 8
|
||||
// );
|
||||
// }
|
||||
// let mut rl = 0.;
|
||||
// for mut coef in fourier_im1.iter_mut() {
|
||||
// *coef = rl;
|
||||
// rl = rl + 1.;
|
||||
// }
|
||||
fourier_transform_forward_as_torus_f128_async(
|
||||
&stream,
|
||||
&mut fourier_re0,
|
||||
&mut fourier_re1,
|
||||
&mut fourier_im0,
|
||||
&mut fourier_im1,
|
||||
&poly,
|
||||
poly.len() as u32,
|
||||
1,
|
||||
);
|
||||
}
|
||||
unsafe {
|
||||
cuda_synchronize_device(0);
|
||||
}
|
||||
|
||||
for ((re0, re1), (im0, im1)) in (&*fourier_re0)
|
||||
.iter()
|
||||
.zip((&*fourier_re1).iter())
|
||||
.zip((&*fourier_im0).iter().zip((&*fourier_im1).iter()))
|
||||
{
|
||||
println!("{:.100} {:.100} {:.100} {:.100}", re0, re1, im0, im1);
|
||||
}
|
||||
fft.backward_as_torus(
|
||||
&mut roundtrip,
|
||||
&fourier_re0,
|
||||
&fourier_re1,
|
||||
&fourier_im0,
|
||||
&fourier_im1,
|
||||
stack.rb_mut(),
|
||||
);
|
||||
for coefficient in roundtrip.iter() {
|
||||
println!("{:?}", coefficient);
|
||||
}
|
||||
|
||||
for (expected, actual) in izip!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
|
||||
if Scalar::BITS <= 64 {
|
||||
assert_eq!(*expected, *actual);
|
||||
} else {
|
||||
let abs_diff = modular_distance(*expected, *actual);
|
||||
let threshold = Scalar::ONE << (128 - 100);
|
||||
assert!(
|
||||
abs_diff < threshold,
|
||||
"abs_diff: {abs_diff}, threshold: {threshold}",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// fn test_product<Scalar: UnsignedTorus>() {
|
||||
// fn convolution_naive<Scalar: UnsignedTorus>(
|
||||
// out: &mut [Scalar],
|
||||
// lhs: &[Scalar],
|
||||
// rhs: &[Scalar],
|
||||
// ) {
|
||||
// assert_eq!(out.len(), lhs.len());
|
||||
// assert_eq!(out.len(), rhs.len());
|
||||
// let n = out.len();
|
||||
// let mut full_prod = vec![Scalar::ZERO; 2 * n];
|
||||
// for i in 0..n {
|
||||
// for j in 0..n {
|
||||
// full_prod[i + j] = full_prod[i + j].wrapping_add(lhs[i].wrapping_mul(rhs[j]));
|
||||
// }
|
||||
// }
|
||||
// for i in 0..n {
|
||||
// out[i] = full_prod[i].wrapping_sub(full_prod[i + n]);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// let mut generator = new_random_generator();
|
||||
// for size_log in 6..=14 {
|
||||
// for _ in 0..10 {
|
||||
// let size = 1_usize << size_log;
|
||||
// let fourier_size = PolynomialSize(size).to_fourier_polynomial_size().0;
|
||||
//
|
||||
// let fft = Fft128::new(PolynomialSize(size));
|
||||
// let fft = fft.as_view();
|
||||
//
|
||||
// let mut poly0 = avec![Scalar::ZERO; size].into_boxed_slice();
|
||||
// let mut poly1 = avec![Scalar::ZERO; size].into_boxed_slice();
|
||||
//
|
||||
// let mut convolution_from_fft = avec![Scalar::ZERO; size].into_boxed_slice();
|
||||
// let mut convolution_from_naive = avec![Scalar::ZERO; size].into_boxed_slice();
|
||||
//
|
||||
// let mut fourier0_re0 = avec![0.0f64; fourier_size].into_boxed_slice();
|
||||
// let mut fourier0_re1 = avec![0.0f64; fourier_size].into_boxed_slice();
|
||||
// let mut fourier0_im0 = avec![0.0f64; fourier_size].into_boxed_slice();
|
||||
// let mut fourier0_im1 = avec![0.0f64; fourier_size].into_boxed_slice();
|
||||
//
|
||||
// let mut fourier1_re0 = avec![0.0f64; fourier_size].into_boxed_slice();
|
||||
// let mut fourier1_re1 = avec![0.0f64; fourier_size].into_boxed_slice();
|
||||
// let mut fourier1_im0 = avec![0.0f64; fourier_size].into_boxed_slice();
|
||||
// let mut fourier1_im1 = avec![0.0f64; fourier_size].into_boxed_slice();
|
||||
//
|
||||
// let integer_magnitude = 16;
|
||||
// for (x, y) in izip!(poly0.as_mut().iter_mut(), poly1.as_mut().iter_mut()) {
|
||||
// *x = generator.random_uniform();
|
||||
// *y = generator.random_uniform();
|
||||
//
|
||||
// *y >>= Scalar::BITS - integer_magnitude;
|
||||
// }
|
||||
//
|
||||
// let mut mem = GlobalPodBuffer::new(fft.backward_scratch().unwrap());
|
||||
// let mut stack = PodStack::new(&mut mem);
|
||||
//
|
||||
// fft.forward_as_torus(
|
||||
// &mut fourier0_re0,
|
||||
// &mut fourier0_re1,
|
||||
// &mut fourier0_im0,
|
||||
// &mut fourier0_im1,
|
||||
// &poly0,
|
||||
// );
|
||||
// fft.forward_as_integer(
|
||||
// &mut fourier1_re0,
|
||||
// &mut fourier1_re1,
|
||||
// &mut fourier1_im0,
|
||||
// &mut fourier1_im1,
|
||||
// &poly1,
|
||||
// );
|
||||
//
|
||||
// for (f0_re0, f0_re1, f0_im0, f0_im1, f1_re0, f1_re1, f1_im0, f1_im1) in izip!(
|
||||
// &mut *fourier0_re0,
|
||||
// &mut *fourier0_re1,
|
||||
// &mut *fourier0_im0,
|
||||
// &mut *fourier0_im1,
|
||||
// &*fourier1_re0,
|
||||
// &*fourier1_re1,
|
||||
// &*fourier1_im0,
|
||||
// &*fourier1_im1,
|
||||
// ) {
|
||||
// let f0_re = f128(*f0_re0, *f0_re1);
|
||||
// let f0_im = f128(*f0_im0, *f0_im1);
|
||||
// let f1_re = f128(*f1_re0, *f1_re1);
|
||||
// let f1_im = f128(*f1_im0, *f1_im1);
|
||||
//
|
||||
// f128(*f0_re0, *f0_re1) = f0_re * f1_re - f0_im * f1_im;
|
||||
// f128(*f0_im0, *f0_im1) = f0_im * f1_re + f0_re * f1_im;
|
||||
// }
|
||||
//
|
||||
// fft.backward_as_torus(
|
||||
// &mut convolution_from_fft,
|
||||
// &fourier0_re0,
|
||||
// &fourier0_re1,
|
||||
// &fourier0_im0,
|
||||
// &fourier0_im1,
|
||||
// stack.rb_mut(),
|
||||
// );
|
||||
// convolution_naive(
|
||||
// convolution_from_naive.as_mut(),
|
||||
// poly0.as_ref(),
|
||||
// poly1.as_ref(),
|
||||
// );
|
||||
//
|
||||
// for (expected, actual) in izip!(
|
||||
// convolution_from_naive.as_ref().iter(),
|
||||
// convolution_from_fft.as_ref().iter()
|
||||
// ) {
|
||||
// let threshold = Scalar::ONE
|
||||
// << (Scalar::BITS.saturating_sub(100 - integer_magnitude - size_log));
|
||||
// let abs_diff = modular_distance(*expected, *actual);
|
||||
// assert!(
|
||||
// abs_diff <= threshold,
|
||||
// "abs_diff: {abs_diff}, threshold: {threshold}",
|
||||
// );
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// #[test]
|
||||
// fn test_roundtrip_u32() {
|
||||
// test_roundtrip::<u32>();
|
||||
// }
|
||||
// #[test]
|
||||
// fn test_roundtrip_u64() {
|
||||
// test_roundtrip::<u64>();
|
||||
// }
|
||||
|
||||
#[test]
|
||||
fn test_roundtrip_u128() {
|
||||
test_roundtrip::<u128>();
|
||||
}
|
||||
|
||||
// fn test_roundtrip_u128<
|
||||
// Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize>,
|
||||
// >(
|
||||
// params: ClassicTestParams<Scalar>,
|
||||
// ) {
|
||||
// test_roundtrip::<u128>();
|
||||
// }
|
||||
|
||||
// create_gpu_parametrized_test!(test_roundtrip_u128);
|
||||
|
||||
// #[test]
|
||||
// fn test_product_u32() {
|
||||
// test_product::<u32>();
|
||||
// }
|
||||
//
|
||||
// #[test]
|
||||
// fn test_product_u64() {
|
||||
// test_product::<u64>();
|
||||
// }
|
||||
//
|
||||
// #[test]
|
||||
// fn test_product_u128() {
|
||||
// test_product::<u128>();
|
||||
// }
|
||||
@@ -1,5 +1,6 @@
|
||||
use crate::core_crypto::algorithms::test::*;
|
||||
|
||||
mod fft;
|
||||
mod glwe_sample_extraction;
|
||||
mod lwe_keyswitch;
|
||||
mod lwe_linear_algebra;
|
||||
|
||||
@@ -630,6 +630,56 @@ pub unsafe fn mult_lwe_ciphertext_vector_cleartext_vector<T: UnsignedInteger>(
|
||||
);
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub unsafe fn fourier_transform_forward_as_integer_f128_async<T: UnsignedInteger>(
|
||||
streams: &CudaStreams,
|
||||
re0: &mut [f64],
|
||||
re1: &mut [f64],
|
||||
im0: &mut [f64],
|
||||
im1: &mut [f64],
|
||||
standard: &[T],
|
||||
fft_size: u32,
|
||||
number_of_samples: u32,
|
||||
) {
|
||||
println!("before cuda api: {:?}", re0);
|
||||
cuda_fourier_transform_forward_as_integer_f128_async(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0],
|
||||
re0.as_mut_ptr() as *mut c_void,
|
||||
re1.as_mut_ptr() as *mut c_void,
|
||||
im0.as_mut_ptr() as *mut c_void,
|
||||
im1.as_mut_ptr() as *mut c_void,
|
||||
standard.as_ptr() as *const c_void,
|
||||
fft_size,
|
||||
number_of_samples,
|
||||
);
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub unsafe fn fourier_transform_forward_as_torus_f128_async<T: UnsignedInteger>(
|
||||
streams: &CudaStreams,
|
||||
re0: &mut [f64],
|
||||
re1: &mut [f64],
|
||||
im0: &mut [f64],
|
||||
im1: &mut [f64],
|
||||
standard: &[T],
|
||||
fft_size: u32,
|
||||
number_of_samples: u32,
|
||||
) {
|
||||
println!("before cuda api: {:?}", re0);
|
||||
cuda_fourier_transform_forward_as_torus_f128_async(
|
||||
streams.ptr[0],
|
||||
streams.gpu_indexes[0],
|
||||
re0.as_mut_ptr() as *mut c_void,
|
||||
re1.as_mut_ptr() as *mut c_void,
|
||||
im0.as_mut_ptr() as *mut c_void,
|
||||
im1.as_mut_ptr() as *mut c_void,
|
||||
standard.as_ptr() as *const c_void,
|
||||
fft_size,
|
||||
number_of_samples,
|
||||
);
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct CudaLweList<T: UnsignedInteger> {
|
||||
// Pointer to GPU data
|
||||
|
||||
4097
twids_complex_128_N_4096_hex_float.log
Normal file
4097
twids_complex_128_N_4096_hex_float.log
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user