Compare commits

...

3 Commits

Author SHA1 Message Date
Beka Barbakadze
d45cb74476 fix precision 2025-02-05 20:00:20 +04:00
Beka Barbakadze
772c049681 fix some bugs 2025-02-03 16:02:57 +04:00
Beka Barbakadze
516ae67990 feat(gpu): Implement fft128 in cuda backend 2025-01-20 15:43:19 +04:00
29 changed files with 39809 additions and 24 deletions

View File

@@ -62,6 +62,7 @@ fn main() {
"cuda/include/integer/integer.h",
"cuda/include/keyswitch.h",
"cuda/include/linear_algebra.h",
"cuda/include/pbs/fft.h",
"cuda/include/pbs/programmable_bootstrap.h",
"cuda/include/pbs/programmable_bootstrap_multibit.h",
];

View File

@@ -83,7 +83,7 @@ endif()
set(CMAKE_CUDA_FLAGS
"${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} ${OPTIMIZATION_FLAGS}\
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true \
--use_fast_math -Xcompiler -fPIC")
--use_fast_math -Xcompiler -fPIC --ptxas-options=-v")
set(INCLUDE_DIR include)

View File

@@ -39,7 +39,7 @@ void *cuda_malloc_async(uint64_t size, cudaStream_t stream, uint32_t gpu_index);
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index);
void cuda_memcpy_async_gpu_to_gpu(void *dest, void const *src, uint64_t size,

View File

@@ -0,0 +1,17 @@
#include <stdint.h>
extern "C" {
void cuda_fourier_transform_forward_as_torus_f128_async(
void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
void *im1, void const *standard, uint32_t const N,
const uint32_t number_of_samples);
void cuda_fourier_transform_forward_as_integer_f128_async(
void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
void *im1, void const *standard, uint32_t const N,
const uint32_t number_of_samples);
void cuda_fourier_transform_backward_as_torus_f128_async(
void *stream, uint32_t gpu_index, void *standard, void const *re0,
void const *re1, void const *im0, void const *im1, uint32_t const N,
const uint32_t number_of_samples);
}

View File

@@ -97,7 +97,7 @@ bool cuda_check_support_thread_block_clusters() {
}
/// Copy memory to the GPU asynchronously
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
void cuda_memcpy_async_to_gpu(void *dest, const void *src, uint64_t size,
cudaStream_t stream, uint32_t gpu_index) {
if (size == 0)
return;

View File

@@ -0,0 +1,370 @@
#ifndef TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_F128_CUH_
#define TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_F128_CUH_
#include <cstdint>
#include <cstring>
struct alignas(16) f128 {
double hi;
double lo;
// Default and parameterized constructors
__host__ __device__ f128() : hi(0.0), lo(0.0) {}
__host__ __device__ f128(double high, double low) : hi(high), lo(low) {}
// Quick two-sum
__host__ __device__ __forceinline__ static f128 quick_two_sum(double a,
double b) {
#ifdef __CUDA_ARCH__
double s = __dadd_rn(a, b);
return f128(s, __dsub_rn(b, __dsub_rn(s, a)));
#else
double s = a + b;
return f128(s, b - (s - a));
#endif;
}
// Two-sum
__host__ __device__ __forceinline__ static f128 two_sum(double a, double b) {
#ifdef __CUDA_ARCH__
double s = __dadd_rn(a, b);
double bb = __dsub_rn(s, a);
return f128(s, __dadd_rn(__dsub_rn(a, __dsub_rn(s, bb)), __dsub_rn(b, bb)));
#else
double s = a + b;
double bb = s - a;
return f128(s, (a - (s - bb)) + (b - bb));
#endif
}
// Two-product
__host__ __device__ __forceinline__ static f128 two_prod(double a, double b) {
#ifdef __CUDA_ARCH__
double p = __dmul_rn(a, b);
double p2 = __fma_rn(a, b, -p);
#else
double p = a * b;
double p2 = fma(a, b, -p);
#endif
return f128(p, p2);
}
__host__ __device__ __forceinline__ static f128 two_diff(double a, double b) {
#ifdef __CUDA_ARCH__
double s = __dsub_rn(a, b);
double bb = __dsub_rn(s, a);
return f128(s, __dsub_rn(__dsub_rn(a, __dsub_rn(s, bb)), __dadd_rn(b, bb)));
#else
double s = a - b;
double bb = s - a;
return f128(s, (a - (s - bb)) - (b + bb));
#endif
}
// Addition
__host__ __device__ static f128 add(const f128 &a, const f128 &b) {
auto s = two_sum(a.hi, b.hi);
auto t = two_sum(a.lo, b.lo);
double hi = s.hi;
double lo = s.lo + t.hi;
hi = hi + lo;
lo = lo - (hi - s.hi);
return f128(hi, lo + t.lo);
}
// Addition with estimate
__host__ __device__ static f128 add_estimate(const f128 &a, const f128 &b) {
auto se = two_sum(a.hi, b.hi);
#ifdef __CUDA_ARCH__
se.lo = __dadd_rn(se.lo, __dadd_rn(a.lo, b.lo));
#else
se.lo += (a.lo + b.lo);
#endif
return quick_two_sum(se.hi, se.lo);
}
// Subtraction with estimate
__host__ __device__ static f128 sub_estimate(const f128 &a, const f128 &b) {
f128 se = two_diff(a.hi, b.hi);
#ifdef __CUDA_ARCH__
se.lo = __dadd_rn(se.lo, a.lo);
se.lo = __dsub_rn(se.lo, b.lo);
#else
se.lo += a.lo;
se.lo -= b.lo;
#endif
return quick_two_sum(se.hi, se.lo);
}
// Subtraction
__host__ __device__ static f128 sub(const f128 &a, const f128 &b) {
auto s = two_diff(a.hi, b.hi);
auto t = two_diff(a.lo, b.lo);
s = quick_two_sum(s.hi, s.lo + t.hi);
return quick_two_sum(s.hi, s.lo + t.lo);
}
// Multiplication
__host__ __device__ static f128 mul(const f128 &a, const f128 &b) {
auto p = two_prod(a.hi, b.hi);
#ifdef __CUDA_ARCH__
double a_0_x_b_1 = __dmul_rn(a.hi, b.lo);
double a_1_x_b_0 = __dmul_rn(a.lo, b.hi);
p.lo = __dadd_rn(p.lo, __dadd_rn(a_0_x_b_1, a_1_x_b_0));
#else
p.lo += (a.hi * b.lo + a.lo * b.hi);
#endif
p = quick_two_sum(p.hi, p.lo);
return p;
}
__host__ __device__ static void
cplx_f128_mul_assign(f128 &c_re, f128 &c_im, const f128 &a_re,
const f128 &a_im, const f128 &b_re, const f128 &b_im) {
auto a_re_x_b_re = mul(a_re, b_re);
auto a_re_x_b_im = mul(a_re, b_im);
auto a_im_x_b_re = mul(a_im, b_re);
auto a_im_x_b_im = mul(a_im, b_im);
c_re = sub_estimate(a_re_x_b_re, a_im_x_b_im);
c_im = add_estimate(a_im_x_b_re, a_re_x_b_im);
}
__host__ __device__ static void
cplx_f128_sub_assign(f128 &c_re, f128 &c_im, const f128 &a_re,
const f128 &a_im, const f128 &b_re, const f128 &b_im) {
c_re = sub_estimate(a_re, b_re);
c_im = sub_estimate(a_im, b_im);
}
__host__ __device__ static void
cplx_f128_add_assign(f128 &c_re, f128 &c_im, const f128 &a_re,
const f128 &a_im, const f128 &b_re, const f128 &b_im) {
c_re = add_estimate(a_re, b_re);
c_im = add_estimate(a_im, b_im);
}
};
struct f128x2 {
f128 re;
f128 im;
__host__ __device__ f128x2() : re(), im() {}
__host__ __device__ f128x2(const f128 &real, const f128 &imag)
: re(real), im(imag) {}
__host__ __device__ f128x2(double real, double imag)
: re(real, 0.0), im(imag, 0.0) {}
__host__ __device__ explicit f128x2(double real)
: re(real, 0.0), im(0.0, 0.0) {}
__host__ __device__ f128x2(const f128x2 &other)
: re(other.re), im(other.im) {}
__host__ __device__ f128x2(f128x2 &&other) noexcept
: re(std::move(other.re)), im(std::move(other.im)) {}
__host__ __device__ f128x2 &operator=(const f128x2 &other) {
if (this != &other) {
re = other.re;
im = other.im;
}
return *this;
}
__host__ __device__ f128x2 &operator=(f128x2 &&other) noexcept {
if (this != &other) {
re = std::move(other.re);
im = std::move(other.im);
}
return *this;
}
__host__ __device__ f128x2 conjugate() const {
return f128x2(re, f128(-im.hi, -im.lo));
}
__host__ __device__ f128 norm_squared() const {
return f128::add(f128::mul(re, re), f128::mul(im, im));
}
__host__ __device__ void zero() {
re = f128(0.0, 0.0);
im = f128(0.0, 0.0);
}
// Addition
__host__ __device__ friend f128x2 operator+(const f128x2 &a,
const f128x2 &b) {
return f128x2(f128::add(a.re, b.re), f128::add(a.im, b.im));
}
// Subtraction
__host__ __device__ friend f128x2 operator-(const f128x2 &a,
const f128x2 &b) {
return f128x2(f128::add(a.re, f128(-b.re.hi, -b.re.lo)),
f128::add(a.im, f128(-b.im.hi, -b.im.lo)));
}
// Multiplication (complex multiplication)
__host__ __device__ friend f128x2 operator*(const f128x2 &a,
const f128x2 &b) {
f128 real_part =
f128::add(f128::mul(a.re, b.re),
f128(-f128::mul(a.im, b.im).hi, -f128::mul(a.im, b.im).lo));
f128 imag_part = f128::add(f128::mul(a.re, b.im), f128::mul(a.im, b.re));
return f128x2(real_part, imag_part);
}
// Addition-assignment operator
__host__ __device__ f128x2 &operator+=(const f128x2 &other) {
re = f128::add(re, other.re);
im = f128::add(im, other.im);
return *this;
}
// Subtraction-assignment operator
__host__ __device__ f128x2 &operator-=(const f128x2 &other) {
re = f128::add(re, f128(-other.re.hi, -other.re.lo));
im = f128::add(im, f128(-other.im.hi, -other.im.lo));
return *this;
}
// Multiplication-assignment operator
__host__ __device__ f128x2 &operator*=(const f128x2 &other) {
f128 new_re =
f128::add(f128::mul(re, other.re), f128(-f128::mul(im, other.im).hi,
-f128::mul(im, other.im).lo));
f128 new_im = f128::add(f128::mul(re, other.im), f128::mul(im, other.re));
re = new_re;
im = new_im;
return *this;
}
};
__host__ __device__ inline uint64_t double_to_bits(double d) {
uint64_t bits = *reinterpret_cast<uint64_t *>(&d);
return bits;
}
__host__ __device__ inline double bits_to_double(uint64_t bits) {
double d = *reinterpret_cast<double *>(&bits);
return d;
}
__host__ __device__ double u128_to_f64(__uint128_t x) {
const __uint128_t ONE = 1;
const double A = ONE << 52;
const double B = ONE << 104;
const double C = ONE << 76;
const double D = 340282366920938500000000000000000000000.;
const __uint128_t threshold = (ONE << 104);
if (x < threshold) {
uint64_t A_bits = double_to_bits(A);
__uint128_t shifted = (x << 12);
uint64_t lower64 = static_cast<uint64_t>(shifted);
lower64 >>= 12;
uint64_t bits_l = A_bits | lower64;
double l_temp = bits_to_double(bits_l);
double l = l_temp - A;
uint64_t B_bits = double_to_bits(B);
uint64_t top64 = static_cast<uint64_t>(x >> 52);
uint64_t bits_h = B_bits | top64;
double h_temp = bits_to_double(bits_h);
double h = h_temp - B;
return (l + h);
} else {
uint64_t C_bits = double_to_bits(C);
__uint128_t shifted = (x >> 12);
uint64_t lower64 = static_cast<uint64_t>(shifted);
lower64 >>= 12;
uint64_t x_lo = static_cast<uint64_t>(x);
uint64_t mask_part = (x_lo & 0xFFFFFFULL);
uint64_t bits_l = C_bits | lower64 | mask_part;
double l_temp = bits_to_double(bits_l);
double l = l_temp - C;
uint64_t D_bits = double_to_bits(D);
uint64_t top64 = static_cast<uint64_t>(x >> 76);
uint64_t bits_h = D_bits | top64;
double h_temp = bits_to_double(bits_h);
double h = h_temp - D;
return (l + h);
}
}
__host__ __device__ __uint128_t f64_to_u128(const double f) {
const __uint128_t ONE = 1;
const uint64_t f_bits = double_to_bits(f);
if (f_bits < 1023ull << 52) {
return 0;
} else {
const __uint128_t m = ONE << 127 | (__uint128_t)f_bits << 75;
const uint64_t s = 1150 - (f_bits >> 52);
if (s >= 128) {
return 0;
} else {
return m >> s;
}
}
}
__host__ __device__ double i128_to_f64(__int128_t const x) {
uint64_t sign = static_cast<uint64_t>(x >> 64) & (1ULL << 63);
__uint128_t abs =
(x < 0) ? static_cast<__uint128_t>(-x) : static_cast<__uint128_t>(x);
return bits_to_double(double_to_bits(u128_to_f64(abs)) | sign);
}
__host__ __device__ f128 u128_to_signed_to_f128(__uint128_t x) {
const double first_approx = i128_to_f64(x);
const uint64_t sign_bit = double_to_bits(first_approx) & (1ull << 63);
const __uint128_t first_approx_roundtrip =
f64_to_u128((first_approx < 0) ? -first_approx : first_approx);
const __uint128_t first_approx_roundtrip_signed =
(sign_bit == (1ull << 63)) ? -first_approx_roundtrip
: first_approx_roundtrip;
double correction = i128_to_f64(x - first_approx_roundtrip_signed);
return f128(first_approx, correction);
};
#include <algorithm>
#include <string>
// Convert __uint128_t to decimal string
std::string to_string_128(__uint128_t value) {
if (value == 0)
return "0";
std::string result;
// Repeatedly divide by 10 and build the number in reverse
while (value > 0) {
unsigned digit = static_cast<unsigned>(value % 10);
result.push_back(static_cast<char>('0' + digit));
value /= 10;
}
// The digits are in reverse order, so reverse them
std::reverse(result.begin(), result.end());
return result;
} // TIP To <b>Run</b> code, press <shortcut actionId="Run"/> or
#endif

View File

@@ -0,0 +1,163 @@
#include "fft128.cuh"
void cuda_fourier_transform_forward_as_integer_f128_async(
void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
void *im1, void const *standard, const uint32_t N,
const uint32_t number_of_samples) {
switch (N) {
case 64:
host_fourier_transform_forward_as_integer_f128<Degree<64>>(
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
(double *)re1, (double *)im0, (double *)im1,
(__uint128_t const *)standard, N, number_of_samples);
break;
case 128:
host_fourier_transform_forward_as_integer_f128<Degree<128>>(
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
(double *)re1, (double *)im0, (double *)im1,
(__uint128_t const *)standard, N, number_of_samples);
break;
case 256:
host_fourier_transform_forward_as_integer_f128<Degree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
(double *)re1, (double *)im0, (double *)im1,
(__uint128_t const *)standard, N, number_of_samples);
break;
case 512:
host_fourier_transform_forward_as_integer_f128<Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
(double *)re1, (double *)im0, (double *)im1,
(__uint128_t const *)standard, N, number_of_samples);
break;
case 1024:
host_fourier_transform_forward_as_integer_f128<Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
(double *)re1, (double *)im0, (double *)im1,
(__uint128_t const *)standard, N, number_of_samples);
break;
case 2048:
host_fourier_transform_forward_as_integer_f128<Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
(double *)re1, (double *)im0, (double *)im1,
(__uint128_t const *)standard, N, number_of_samples);
break;
case 4096:
host_fourier_transform_forward_as_integer_f128<Degree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
(double *)re1, (double *)im0, (double *)im1,
(__uint128_t const *)standard, N, number_of_samples);
break;
default:
PANIC("Cuda error (f128 fft): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
void cuda_fourier_transform_forward_as_torus_f128_async(
void *stream, uint32_t gpu_index, void *re0, void *re1, void *im0,
void *im1, void const *standard, const uint32_t N,
const uint32_t number_of_samples) {
switch (N) {
case 64:
host_fourier_transform_forward_as_torus_f128<Degree<64>>(
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
(double *)re1, (double *)im0, (double *)im1,
(__uint128_t const *)standard, N, number_of_samples);
break;
case 128:
host_fourier_transform_forward_as_torus_f128<Degree<128>>(
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
(double *)re1, (double *)im0, (double *)im1,
(__uint128_t const *)standard, N, number_of_samples);
break;
case 256:
host_fourier_transform_forward_as_torus_f128<Degree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
(double *)re1, (double *)im0, (double *)im1,
(__uint128_t const *)standard, N, number_of_samples);
break;
case 512:
host_fourier_transform_forward_as_torus_f128<Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
(double *)re1, (double *)im0, (double *)im1,
(__uint128_t const *)standard, N, number_of_samples);
break;
case 1024:
host_fourier_transform_forward_as_torus_f128<Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
(double *)re1, (double *)im0, (double *)im1,
(__uint128_t const *)standard, N, number_of_samples);
break;
case 2048:
host_fourier_transform_forward_as_torus_f128<Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
(double *)re1, (double *)im0, (double *)im1,
(__uint128_t const *)standard, N, number_of_samples);
break;
case 4096:
host_fourier_transform_forward_as_torus_f128<Degree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, (double *)re0,
(double *)re1, (double *)im0, (double *)im1,
(__uint128_t const *)standard, N, number_of_samples);
break;
default:
PANIC("Cuda error (f128 fft): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}
void cuda_fourier_transform_backward_as_torus_f128_async(
void *stream, uint32_t gpu_index, void *standard, void const *re0,
void const *re1, void const *im0, void const *im1, const uint32_t N,
const uint32_t number_of_samples) {
switch (N) {
case 64:
host_fourier_transform_backward_as_torus_f128<Degree<64>>(
static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
(double const *)re0, (double const *)re1, (double const *)im0,
(double const *)im1, N, number_of_samples);
break;
case 128:
host_fourier_transform_backward_as_torus_f128<Degree<128>>(
static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
(double const *)re0, (double const *)re1, (double const *)im0,
(double const *)im1, N, number_of_samples);
break;
case 256:
host_fourier_transform_backward_as_torus_f128<Degree<256>>(
static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
(double const *)re0, (double const *)re1, (double const *)im0,
(double const *)im1, N, number_of_samples);
break;
case 512:
host_fourier_transform_backward_as_torus_f128<Degree<512>>(
static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
(double const *)re0, (double const *)re1, (double const *)im0,
(double const *)im1, N, number_of_samples);
break;
case 1024:
host_fourier_transform_backward_as_torus_f128<Degree<1024>>(
static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
(double const *)re0, (double const *)re1, (double const *)im0,
(double const *)im1, N, number_of_samples);
break;
case 2048:
host_fourier_transform_backward_as_torus_f128<Degree<2048>>(
static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
(double const *)re0, (double const *)re1, (double const *)im0,
(double const *)im1, N, number_of_samples);
break;
case 4096:
host_fourier_transform_backward_as_torus_f128<Degree<4096>>(
static_cast<cudaStream_t>(stream), gpu_index, (__uint128_t *)standard,
(double const *)re0, (double const *)re1, (double const *)im0,
(double const *)im1, N, number_of_samples);
break;
default:
PANIC("Cuda error (f128 fft): unsupported polynomial size. Supported "
"N's are powers of two"
" in the interval [256..4096].")
}
}

View File

@@ -0,0 +1,760 @@
#ifndef TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_FFT128_CUH_
#define TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_FFT128_CUH_
#include "f128.cuh"
#include "pbs/fft.h"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "twiddles.cuh"
#include "types/complex/operations.cuh"
#include <iostream>
using Index = unsigned;
#define NEG_TWID(i) \
f128x2(f128(neg_twiddles_re_hi[(i)], neg_twiddles_re_lo[(i)]), \
f128(neg_twiddles_im_hi[(i)], neg_twiddles_im_lo[(i)]))
#define F64x4_TO_F128x2(f128x2_reg, ind) \
f128x2_reg.re.hi = dt_re_hi[ind]; \
f128x2_reg.re.lo = dt_re_lo[ind]; \
f128x2_reg.im.hi = dt_im_hi[ind]; \
f128x2_reg.im.lo = dt_im_lo[ind]
#define F128x2_TO_F64x4(f128x2_reg, ind) \
dt_re_hi[ind] = f128x2_reg.re.hi; \
dt_re_lo[ind] = f128x2_reg.re.lo; \
dt_im_hi[ind] = f128x2_reg.im.hi; \
dt_im_lo[ind] = f128x2_reg.im.lo
// zl - left part of butterfly operation
// zr - right part of butterfly operation
// re - real part
// im - imaginary part
// hi - high bits
// lo - low bits
// dt - list
// cf - single coefficient
template <class params>
__device__ void negacyclic_forward_fft_f128(double *dt_re_hi, double *dt_re_lo,
double *dt_im_hi,
double *dt_im_lo) {
__syncthreads();
constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
constexpr Index LOG2_DEGREE = params::log2_degree;
constexpr Index HALF_DEGREE = params::degree >> 1;
constexpr Index STRIDE = params::degree / params::opt;
f128x2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;
Index tid = threadIdx.x;
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0) {
printf("BUTTERFLY_DEPTH %d\n", BUTTERFLY_DEPTH);
printf("LOG2_DEGREE %d\n", LOG2_DEGREE);
printf("HALF_DEGREE %d\n", HALF_DEGREE);
printf("STRIDE %d\n", STRIDE);
printf("Params::degree %d\n", params::degree);
printf("opt %d\n", params::opt);
}
__syncthreads();
// load into registers
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
F64x4_TO_F128x2(u[i], tid);
F64x4_TO_F128x2(v[i], tid + HALF_DEGREE);
// u[i].re.hi = dt_re_hi[tid];
// u[i].re.lo = dt_re_lo[tid];
// u[i].im.hi = dt_im_hi[tid];
// u[i].im.lo = dt_im_lo[tid];
// v[i].re.hi = dt_re_hi[tid + HALF_DEGREE];
// v[i].re.lo = dt_re_lo[tid + HALF_DEGREE];
// v[i].im.hi = dt_im_hi[tid + HALF_DEGREE];
// v[i].im.lo = dt_im_lo[tid + HALF_DEGREE];
// F64x4_TO_F128x2(u[i], tid);
// F64x4_TO_F128x2(v[i], tid + HALF_DEGREE);
tid += STRIDE;
}
// level 1
// we don't make actual complex multiplication on level1 since we have only
// one twiddle, it's real and image parts are equal, so we can multiply
// it with simpler operations
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
auto ww = NEG_TWID(1);
f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, NEG_TWID(1).re,
NEG_TWID(1).im);
// w = v[i] * NEG_TWID(1);
// __syncthreads();
// if (threadIdx.x == 0 && blockIdx.x == 0) {
// printf("w = %.5f %.5f %.5f %.5f\n", ww.re.hi, ww.re.lo, ww.im.hi,
// ww.im.lo); printf("u = %.5f %.5f %.5f %.5f\n", u[i].re.hi,
// u[i].re.lo, u[i].im.hi, u[i].im.lo); printf("v = %.5f %.5f %.5f
// %.5f\n", v[i].re.hi, v[i].re.lo, v[i].im.hi, v[i].im.lo); printf("wv
// = %.5f %.5f %.5f %.5f\n", w.re.hi, w.re.lo, w.im.hi, w.im.lo);
// }
// __syncthreads();
// v[i] = u[i] - w;
// u[i] = u[i] + w;
f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re, w.im);
f128::cplx_f128_add_assign(u[i].re, u[i].im, u[i].re, u[i].im, w.re, w.im);
}
// tid = threadIdx.x;
// #pragma unroll
// for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
// F128x2_TO_F64x4(u[i], (tid));
// F128x2_TO_F64x4(v[i], (tid + HALF_DEGREE));
// tid = tid + STRIDE;
// }
// __syncthreads();
Index twiddle_shift = 1;
int ii = 0;
for (Index l = LOG2_DEGREE - 1; l >= 1; --l) {
ii++;
Index lane_mask = 1 << (l - 1);
Index thread_mask = (1 << l) - 1;
twiddle_shift <<= 1;
tid = threadIdx.x;
__syncthreads();
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
F128x2_TO_F64x4(((u_stays_in_register) ? v[i] : u[i]), tid);
tid = tid + STRIDE;
}
__syncthreads();
tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
F64x4_TO_F128x2(w, tid ^ lane_mask);
u[i] = (u_stays_in_register) ? u[i] : w;
v[i] = (u_stays_in_register) ? w : v[i];
w = NEG_TWID(tid / lane_mask + twiddle_shift);
// w *= v[i];
f128::cplx_f128_mul_assign(w.re, w.im, v[i].re, v[i].im, w.re, w.im);
f128::cplx_f128_sub_assign(v[i].re, v[i].im, u[i].re, u[i].im, w.re,
w.im);
f128::cplx_f128_add_assign(u[i].re, u[i].im, u[i].re, u[i].im, w.re,
w.im);
tid = tid + STRIDE;
}
}
__syncthreads();
// store registers in SM
tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
F128x2_TO_F64x4(u[i], tid * 2);
F128x2_TO_F64x4(v[i], (tid * 2 + 1));
tid = tid + STRIDE;
}
__syncthreads();
}
template <class params>
__device__ void negacyclic_inverse_fft_f128(double *dt_re_hi, double *dt_re_lo,
double *dt_im_hi,
double *dt_im_lo) {
__syncthreads();
constexpr Index BUTTERFLY_DEPTH = params::opt >> 1;
constexpr Index LOG2_DEGREE = params::log2_degree;
constexpr Index DEGREE = params::degree;
constexpr Index HALF_DEGREE = params::degree >> 1;
constexpr Index STRIDE = params::degree / params::opt;
size_t tid = threadIdx.x;
f128x2 u[BUTTERFLY_DEPTH], v[BUTTERFLY_DEPTH], w;
// load into registers and divide by compressed polynomial size
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
F64x4_TO_F128x2(u[i], 2 * tid);
F64x4_TO_F128x2(v[i], 2 * tid + 1);
tid += STRIDE;
}
Index twiddle_shift = DEGREE;
for (Index l = 1; l <= LOG2_DEGREE - 1; ++l) {
Index lane_mask = 1 << (l - 1);
Index thread_mask = (1 << l) - 1;
tid = threadIdx.x;
twiddle_shift >>= 1;
// at this point registers are ready for the butterfly
tid = threadIdx.x;
__syncthreads();
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
w = (u[i] - v[i]);
u[i] += v[i];
v[i] = w * NEG_TWID(tid / lane_mask + twiddle_shift).conjugate();
// keep one of the register for next iteration and store another one in sm
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
F128x2_TO_F64x4((u_stays_in_register) ? v[i] : u[i], tid);
tid = tid + STRIDE;
}
__syncthreads();
// prepare registers for next butterfly iteration
tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
Index rank = tid & thread_mask;
bool u_stays_in_register = rank < lane_mask;
F64x4_TO_F128x2(w, tid ^ lane_mask);
u[i] = (u_stays_in_register) ? u[i] : w;
v[i] = (u_stays_in_register) ? w : v[i];
tid = tid + STRIDE;
}
}
// last iteration
for (Index i = 0; i < BUTTERFLY_DEPTH; ++i) {
w = (u[i] - v[i]);
u[i] = u[i] + v[i];
v[i] = w * NEG_TWID(1).conjugate();
}
__syncthreads();
// store registers in SM
tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < BUTTERFLY_DEPTH; i++) {
F128x2_TO_F64x4(u[i], tid);
F128x2_TO_F64x4(v[i], tid + HALF_DEGREE);
tid = tid + STRIDE;
}
__syncthreads();
}
// params is expected to be full degree not half degree
template <class params>
__device__ void convert_u128_to_f128_as_integer(
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
const __uint128_t *in_re, const __uint128_t *in_im) {
Index tid = threadIdx.x;
// #pragma unroll
for (Index i = 0; i < params::opt / 2; i++) {
__syncthreads();
auto out_re = u128_to_signed_to_f128(in_re[tid]);
__syncthreads();
auto out_im = u128_to_signed_to_f128(in_im[tid]);
__syncthreads();
out_re_hi[tid] = out_re.hi;
out_re_lo[tid] = out_re.lo;
out_im_hi[tid] = out_im.hi;
out_im_lo[tid] = out_im.lo;
// __syncthreads();
// if (threadIdx.x == 0 && blockIdx.x == 0) {
// printf("%.5f %.5f %.5f %.5f\n", out_re_hi[tid], out_re_lo[tid],
// out_im_hi[tid],
// out_im_lo[tid]);
// }
// __syncthreads();
tid += params::degree / params::opt;
}
}
// params is expected to be full degree not half degree
template <class params>
__device__ void convert_u128_to_f128_as_torus(
double *out_re_hi, double *out_re_lo, double *out_im_hi, double *out_im_lo,
const __uint128_t *in_re, const __uint128_t *in_im) {
const double normalization = pow(2., -128.);
Index tid = threadIdx.x;
// #pragma unroll
for (Index i = 0; i < params::opt / 2; i++) {
__syncthreads();
auto out_re = u128_to_signed_to_f128(in_re[tid]);
__syncthreads();
auto out_im = u128_to_signed_to_f128(in_im[tid]);
__syncthreads();
out_re_hi[tid] = out_re.hi * normalization;
out_re_lo[tid] = out_re.lo * normalization;
out_im_hi[tid] = out_im.hi * normalization;
out_im_lo[tid] = out_im.lo * normalization;
// __syncthreads();
// if (threadIdx.x == 0 && blockIdx.x == 0) {
// printf("%.5f %.5f %.5f %.5f\n", out_re_hi[tid], out_re_lo[tid],
// out_im_hi[tid],
// out_im_lo[tid]);
// }
// __syncthreads();
tid += params::degree / params::opt;
}
}
// params is expected to be full degree not half degree
template <class params>
__global__ void
batch_convert_u128_to_f128_as_integer(double *out_re_hi, double *out_re_lo,
double *out_im_hi, double *out_im_lo,
const __uint128_t *in) {
convert_u128_to_f128_as_integer<params>(
&out_re_hi[blockIdx.x * params::degree / 2],
&out_re_lo[blockIdx.x * params::degree / 2],
&out_im_hi[blockIdx.x * params::degree / 2],
&out_im_lo[blockIdx.x * params::degree / 2],
&in[blockIdx.x * params::degree],
&in[blockIdx.x * params::degree + params::degree / 2]);
}
// params is expected to be full degree not half degree
template <class params>
__global__ void
batch_convert_u128_to_f128_as_torus(double *out_re_hi, double *out_re_lo,
double *out_im_hi, double *out_im_lo,
const __uint128_t *in) {
convert_u128_to_f128_as_torus<params>(
&out_re_hi[blockIdx.x * params::degree / 2],
&out_re_lo[blockIdx.x * params::degree / 2],
&out_im_hi[blockIdx.x * params::degree / 2],
&out_im_lo[blockIdx.x * params::degree / 2],
&in[blockIdx.x * params::degree],
&in[blockIdx.x * params::degree + params::degree / 2]);
}
template <class params, sharedMemDegree SMD>
__global__ void
batch_NSMFFT_128(double *in_re_hi, double *in_re_lo, double *in_im_hi,
double *in_im_lo, double *out_re_hi, double *out_re_lo,
double *out_im_hi, double *out_im_lo, double *buffer) {
extern __shared__ double sharedMemoryFFT[];
double *re_hi, *re_lo, *im_hi, *im_lo;
// debug
__syncthreads();
if (threadIdx.x == 0 && blockIdx.x == 0) {
printf("Params::degree %d\n", params::degree);
printf("opt %d\n", params::opt);
}
__syncthreads();
if (SMD == NOSM) {
re_hi =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 0];
re_lo =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 1];
im_hi =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 2];
im_lo =
&buffer[blockIdx.x * params::degree / 2 * 4 + params::degree / 2 * 3];
} else {
re_hi = &sharedMemoryFFT[params::degree / 2 * 0];
re_lo = &sharedMemoryFFT[params::degree / 2 * 1];
im_hi = &sharedMemoryFFT[params::degree / 2 * 2];
im_lo = &sharedMemoryFFT[params::degree / 2 * 3];
}
Index tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < params::opt / 2; ++i) {
re_hi[tid] = in_re_hi[blockIdx.x * (params::degree / 2) + tid];
re_lo[tid] = in_re_lo[blockIdx.x * (params::degree / 2) + tid];
im_hi[tid] = in_im_hi[blockIdx.x * (params::degree / 2) + tid];
im_lo[tid] = in_im_lo[blockIdx.x * (params::degree / 2) + tid];
tid += params::degree / params::opt;
}
__syncthreads();
negacyclic_forward_fft_f128<HalfDegree<params>>(re_hi, re_lo, im_hi, im_lo);
__syncthreads();
tid = threadIdx.x;
#pragma unroll
for (Index i = 0; i < params::opt / 2; ++i) {
out_re_hi[blockIdx.x * (params::degree / 2) + tid] = re_hi[tid];
out_re_lo[blockIdx.x * (params::degree / 2) + tid] = re_lo[tid];
out_im_hi[blockIdx.x * (params::degree / 2) + tid] = im_hi[tid];
out_im_lo[blockIdx.x * (params::degree / 2) + tid] = im_lo[tid];
tid += params::degree / params::opt;
}
}
void print_uint128_bits(__uint128_t value) {
char buffer[129]; // 128 bits + null terminator
buffer[128] = '\0'; // Null-terminate the string
for (int i = 127; i >= 0; --i) {
buffer[i] = (value & 1) ? '1' : '0'; // Extract the least significant bit
value >>= 1; // Shift right by 1 bit
}
printf("%s\n", buffer);
}
template <class params>
__host__ void host_fourier_transform_forward_as_integer_f128(
cudaStream_t stream, uint32_t gpu_index, double *re0, double *re1,
double *im0, double *im1, const __uint128_t *standard, const uint32_t N,
const uint32_t number_of_samples) {
// for (int i = 0; i < N / 2; i++)
// {
// printf("%.10f\n", re0[i]);
// }
// printf("cpp_poly_host\n");
// for (int i = 0; i < N; i++) {
// print_uint128_bits(standard[i]);
// }
// printf("check #1\n");
// for (int i = 0; i < 32; i++) {
// standard[i + 32] = standard[i];
// }
// allocate device buffers
double *d_re0 =
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
double *d_re1 =
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
double *d_im0 =
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
double *d_im1 =
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
__uint128_t *d_standard = (__uint128_t *)cuda_malloc_async(
N * sizeof(__uint128_t), stream, gpu_index);
// copy input into device
cuda_memcpy_async_to_gpu(d_standard, standard, N * sizeof(__uint128_t),
stream, gpu_index);
// setup launch parameters
size_t required_shared_memory_size = sizeof(double) * N / 2 * 4;
int grid_size = number_of_samples;
int block_size = params::degree / params::opt;
bool full_sm =
(required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
size_t buffer_size = full_sm ? 0 : (size_t)number_of_samples * N / 2 * 4;
size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
// configure shared memory for batch fft kernel
if (full_sm) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
}
// convert u128 into 4 x double
batch_convert_u128_to_f128_as_integer<params>
<<<grid_size, block_size, 0, stream>>>(d_re0, d_re1, d_im0, d_im1,
d_standard);
// call negacyclic 128 bit forward fft.
if (full_sm) {
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>
<<<grid_size, block_size, shared_memory_size, stream>>>(
d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
} else {
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, NOSM>
<<<grid_size, block_size, shared_memory_size, stream>>>(
d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
}
cudaDeviceSynchronize();
// print_debug("re_hi", d_re0, 32);
// print_debug("d_re_lo", d_re1, 32);
// print_debug("d_im_hi", d_im0, 32);
// print_debug("d_im_lo", d_im1, 32);
cuda_memcpy_async_to_cpu(re0, d_re0, N / 2 * sizeof(double), stream,
gpu_index);
cuda_memcpy_async_to_cpu(re1, d_re1, N / 2 * sizeof(double), stream,
gpu_index);
cuda_memcpy_async_to_cpu(im0, d_im0, N / 2 * sizeof(double), stream,
gpu_index);
cuda_memcpy_async_to_cpu(im1, d_im1, N / 2 * sizeof(double), stream,
gpu_index);
cuda_drop_async(d_standard, stream, gpu_index);
cuda_drop_async(d_re0, stream, gpu_index);
cuda_drop_async(d_re1, stream, gpu_index);
cuda_drop_async(d_im0, stream, gpu_index);
cuda_drop_async(d_im1, stream, gpu_index);
cudaDeviceSynchronize();
// printf("params::degree: %d\n", params::degree);
// printf("params::opt: %d\n", params::opt);
// printf("N: %d\n", N);
// for (int i = 0; i < N; i++)
// {
// printf("%s\n", to_string_128(standard[i]).c_str());
// }
//
// for (int i = 0; i < N / 2; i++) {
//// auto re = u128_to_signed_to_f128(standard[i]);
//// auto im = u128_to_signed_to_f128(standard[i + N / 2]);
//// printf("%.10f %.10f %.10f %.10f\n", re.hi, re.lo, im.hi, im.lo);
// printf("%.10f %.10f %.10f %.10f\n", re0[i], re1[i], im0[i], im1[i]);
// }
}
__global__ void print_twiddles(int N) {
for (int i = 0; i < N / 2; i++) {
printf("%.73f %.73f %.73f %.73f\n", neg_twiddles_re_hi[i],
neg_twiddles_re_lo[i], neg_twiddles_im_hi[i], neg_twiddles_im_lo[i]);
}
}
__global__ void print_c128(double *re0, double *re1, double *im0, double *im1,
int N) {
for (int i = 0; i < N / 2; i++) {
printf("%.73f %.73f %.73f %.73f\n", re0[i], re1[i], im0[i], im1[i]);
}
}
template <class params>
__host__ void host_fourier_transform_forward_as_torus_f128(
cudaStream_t stream, uint32_t gpu_index, double *re0, double *re1,
double *im0, double *im1, const __uint128_t *standard, const uint32_t N,
const uint32_t number_of_samples) {
print_twiddles<<<1, 1>>>(N);
cudaDeviceSynchronize();
// for (int i = 0; i < N / 2; i++)
// {
// printf("%.10f\n", re0[i]);
// }
// printf("cpp_poly_host\n");
// for (int i = 0; i < N; i++) {
// print_uint128_bits(standard[i]);
// }
// printf("check #1\n");
// for (int i = 0; i < 32; i++) {
// standard[i + 32] = standard[i];
// }
// allocate device buffers
double *d_re0 =
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
double *d_re1 =
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
double *d_im0 =
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
double *d_im1 =
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
__uint128_t *d_standard = (__uint128_t *)cuda_malloc_async(
N * sizeof(__uint128_t), stream, gpu_index);
// copy input into device
cuda_memcpy_async_to_gpu(d_standard, standard, N * sizeof(__uint128_t),
stream, gpu_index);
// setup launch parameters
size_t required_shared_memory_size = sizeof(double) * N / 2 * 4;
int grid_size = number_of_samples;
int block_size = params::degree / params::opt;
bool full_sm =
(required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
size_t buffer_size = full_sm ? 0 : (size_t)number_of_samples * N / 2 * 4;
size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
// configure shared memory for batch fft kernel
if (full_sm) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
}
// convert u128 into 4 x double
batch_convert_u128_to_f128_as_torus<params>
<<<grid_size, block_size, 0, stream>>>(d_re0, d_re1, d_im0, d_im1,
d_standard);
print_c128<<<1, 1>>>(d_re0, d_re1, d_im0, d_im1, N);
cudaDeviceSynchronize();
// call negacyclic 128 bit forward fft.
if (full_sm) {
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>
<<<grid_size, block_size, shared_memory_size, stream>>>(
d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
} else {
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, NOSM>
<<<grid_size, block_size, shared_memory_size, stream>>>(
d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
}
cudaDeviceSynchronize();
// print_debug("re_hi", d_re0, 32);
// print_debug("d_re_lo", d_re1, 32);
// print_debug("d_im_hi", d_im0, 32);
// print_debug("d_im_lo", d_im1, 32);
cuda_memcpy_async_to_cpu(re0, d_re0, N / 2 * sizeof(double), stream,
gpu_index);
cuda_memcpy_async_to_cpu(re1, d_re1, N / 2 * sizeof(double), stream,
gpu_index);
cuda_memcpy_async_to_cpu(im0, d_im0, N / 2 * sizeof(double), stream,
gpu_index);
cuda_memcpy_async_to_cpu(im1, d_im1, N / 2 * sizeof(double), stream,
gpu_index);
cuda_drop_async(d_standard, stream, gpu_index);
cuda_drop_async(d_re0, stream, gpu_index);
cuda_drop_async(d_re1, stream, gpu_index);
cuda_drop_async(d_im0, stream, gpu_index);
cuda_drop_async(d_im1, stream, gpu_index);
cudaDeviceSynchronize();
// printf("params::degree: %d\n", params::degree);
// printf("params::opt: %d\n", params::opt);
// printf("N: %d\n", N);
// for (int i = 0; i < N; i++)
// {
// printf("%s\n", to_string_128(standard[i]).c_str());
// }
// for (int i = 0; i < N / 2; i++) {
//// auto re = u128_to_signed_to_f128(standard[i]);
//// auto im = u128_to_signed_to_f128(standard[i + N / 2]);
//// printf("%.10f %.10f %.10f %.10f\n", re.hi, re.lo, im.hi, im.lo);
// printf("%.10f %.10f %.10f %.10f\n", re0[i], re1[i], im0[i], im1[i]);
// }
}
template <class params>
__host__ void host_fourier_transform_backward_as_torus_f128(
cudaStream_t stream, uint32_t gpu_index, __uint128_t *standard,
double const *re0, double const *re1, double const *im0, double const *im1,
const uint32_t N, const uint32_t number_of_samples) {
// allocate device buffers
double *d_re0 =
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
double *d_re1 =
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
double *d_im0 =
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
double *d_im1 =
(double *)cuda_malloc_async(N / 2 * sizeof(double), stream, gpu_index);
__uint128_t *d_standard = (__uint128_t *)cuda_malloc_async(
N * sizeof(__uint128_t), stream, gpu_index);
// // copy input into device
cuda_memcpy_async_to_gpu(d_re0, standard, N / 2 * sizeof(double), stream,
gpu_index);
cuda_memcpy_async_to_gpu(d_re1, standard, N / 2 * sizeof(double), stream,
gpu_index);
cuda_memcpy_async_to_gpu(d_im0, standard, N / 2 * sizeof(double), stream,
gpu_index);
cuda_memcpy_async_to_gpu(d_im1, standard, N / 2 * sizeof(double), stream,
gpu_index);
// setup launch parameters
size_t required_shared_memory_size = sizeof(double) * N / 2 * 4;
int grid_size = number_of_samples;
int block_size = params::degree / params::opt;
bool full_sm =
(required_shared_memory_size <= cuda_get_max_shared_memory(gpu_index));
size_t buffer_size = full_sm ? 0 : (size_t)number_of_samples * N / 2 * 4;
size_t shared_memory_size = full_sm ? required_shared_memory_size : 0;
double *buffer = (double *)cuda_malloc_async(buffer_size, stream, gpu_index);
// configure shared memory for batch fft kernel
if (full_sm) {
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
}
// // convert u128 into 4 x double
// batch_convert_u128_to_f128_as_torus<params><<<grid_size, block_size, 0,
// stream>>>(
// d_re0, d_re1, d_im0, d_im1, d_standard);
// call negacyclic 128 bit forward fft.
// if (full_sm) {
// negacyclic_inverse_fft_f128<FFTDegree<params, ForwardFFT>,
// FULLSM><<<grid_size, block_size, shared_memory_size, stream>>>
// (d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
// } else {
// batch_NSMFFT_128<FFTDegree<params, ForwardFFT>, NOSM><<<grid_size,
// block_size, shared_memory_size, stream>>>
// (d_re0, d_re1, d_im0, d_im1, d_re0, d_re1, d_im0, d_im1, buffer);
//
// }
//
// cudaDeviceSynchronize();
//// print_debug("re_hi", d_re0, 32);
//// print_debug("d_re_lo", d_re1, 32);
//// print_debug("d_im_hi", d_im0, 32);
//// print_debug("d_im_lo", d_im1, 32);
//
cuda_memcpy_async_to_cpu(standard, d_standard, N * sizeof(__uint128_t),
stream, gpu_index);
cuda_drop_async(d_standard, stream, gpu_index);
cuda_drop_async(d_re0, stream, gpu_index);
cuda_drop_async(d_re1, stream, gpu_index);
cuda_drop_async(d_im0, stream, gpu_index);
cuda_drop_async(d_im1, stream, gpu_index);
cudaDeviceSynchronize();
//
// printf("params::degree: %d\n", params::degree);
// printf("params::opt: %d\n", params::opt);
// printf("N: %d\n", N);
// for (int i = 0; i < N; i++)
// {
// printf("%s\n", to_string_128(standard[i]).c_str());
// }
//
// for (int i = 0; i < N / 2; i++) {
//// auto re = u128_to_signed_to_f128(standard[i]);
//// auto im = u128_to_signed_to_f128(standard[i + N / 2]);
//// printf("%.10f %.10f %.10f %.10f\n", re.hi, re.lo, im.hi, im.lo);
// printf("%.10f %.10f %.10f %.10f\n", re0[i], re1[i], im0[i], im1[i]);
// }
//
}
#endif // TFHE_RS_BACKENDS_TFHE_CUDA_BACKEND_CUDA_SRC_FFT128_FFT128_CUH_

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
#ifndef GPU_BOOTSTRAP_128_TWIDDLES_CUH
#define GPU_BOOTSTRAP_128_TWIDDLES_CUH
/*
* 'negtwiddles' are stored in device memory to profit caching
*/
extern __device__ double neg_twiddles_re_hi[4096];
extern __device__ double neg_twiddles_re_lo[4096];
extern __device__ double neg_twiddles_im_hi[4096];
extern __device__ double neg_twiddles_im_lo[4096];
#endif

View File

@@ -1238,6 +1238,45 @@ extern "C" {
input_lwe_ciphertext_count: u32,
);
}
extern "C" {
pub fn cuda_fourier_transform_forward_as_torus_f128_async(
stream: *mut ffi::c_void,
gpu_index: u32,
re0: *mut ffi::c_void,
re1: *mut ffi::c_void,
im0: *mut ffi::c_void,
im1: *mut ffi::c_void,
standard: *const ffi::c_void,
N: u32,
number_of_samples: u32,
);
}
extern "C" {
pub fn cuda_fourier_transform_forward_as_integer_f128_async(
stream: *mut ffi::c_void,
gpu_index: u32,
re0: *mut ffi::c_void,
re1: *mut ffi::c_void,
im0: *mut ffi::c_void,
im1: *mut ffi::c_void,
standard: *const ffi::c_void,
N: u32,
number_of_samples: u32,
);
}
extern "C" {
pub fn cuda_fourier_transform_backward_as_torus_f128_async(
stream: *mut ffi::c_void,
gpu_index: u32,
standard: *mut ffi::c_void,
re0: *const ffi::c_void,
re1: *const ffi::c_void,
im0: *const ffi::c_void,
im1: *const ffi::c_void,
N: u32,
number_of_samples: u32,
);
}
extern "C" {
pub fn cuda_fourier_polynomial_mul(
stream: *mut ffi::c_void,

View File

@@ -4,5 +4,6 @@
#include "cuda/include/integer/integer.h"
#include "cuda/include/keyswitch.h"
#include "cuda/include/linear_algebra.h"
#include "cuda/include/pbs/fft.h"
#include "cuda/include/pbs/programmable_bootstrap.h"
#include "cuda/include/pbs/programmable_bootstrap_multibit.h"

5
bench_log.txt Normal file
View File

@@ -0,0 +1,5 @@
Finished `bench` profile [optimized] target(s) in 0.10s
Running benches/integer/bench.rs (target/release/deps/integer_bench-f3741775ea815609)
Benchmarking integer::cuda::div_rem/integer::cuda::div_rem::PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS:...
Benchmarking integer::cuda::div_rem/integer::cuda::div_rem::PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS:...: Warming up for 3.0000 s
Benchmarking integer::cuda::div_rem/integer::cuda::div_rem::PARAM_GPU_MULTI_BIT_MESSAGE_2_CARRY_2_GROUP_3_KS_PBS:...: Collecting 15 samples in estimated 100.21 s (240 iterations)

653
branches.log Normal file
View File

@@ -0,0 +1,653 @@
al/fix_scalar_shifts
al/signed_scalar_div
bb/signed_div
bench/gpu/div_rem
chore/gpu/sum_ciphertexts_comments
chre/gpu/integer_mult_gpu_params
cuda/sprint_demo_bench
feat/cuda/abs
feat/cuda/accumulate_tbc_with_registers
feat/cuda/new_type_cast
feat/cuda/new_type_cast_no_round
feat/cuda/signed_div
feat/cuda_fft/reduce_sm_io
feat/cuda_fft/shuffle_csel
feat/gpu/apply_lut_with_factor
feat/gpu/cuda_vector_add
feat/gpu/cufftdx
feat/gpu/div
feat/gpu/div_backup
feat/gpu/div_backup2
feat/gpu/div_backup2_cleanup
feat/gpu/div_rem_update_info
feat/gpu/fft128
feat/gpu/fft_4_step
feat/gpu/generate_last_block_inner_propagation
feat/gpu/host_rotate_blocks
feat/gpu/mul_boolean_optimization
feat/gpu/overflow_add
feat/gpu/overflowing_scalar_add_and_sub
feat/gpu/overflowing_sub
feat/gpu/propagate_single_carry_get_input_carries
feat/gpu/reuse_pbs_memory
feat/gpu/shifts_one_wave
feat/gpu/signed_overflowing_add_or_sub
feat/gpu/signed_overflowing_add_or_sub_with_cuda
feat/gpu/signed_overflowing_sub
feat/gpu/signed_overflowing_sub_with_cuda
feat/gpu/signed_scalar_add
feat/gpu/unsigned_overflowing_scalar_add
fix/gpu/div_mod/refactor_memory
fix/gpu/fix_terms_degree_bug
* fix/gpu/full_carry_prop_noise_level
fix/gpu/single_carry_propagation
main
new_fft/fft_wide
sk/feat/cuda-fft
tm/full-prop-clean
remotes/origin/0.5.vector
remotes/origin/HEAD -> origin/main
remotes/origin/add-c-api-gpu
remotes/origin/add-license-faq
remotes/origin/add_glwe_keyswitch
remotes/origin/al/add_mem_check
remotes/origin/al/add_vm_targets_erc20
remotes/origin/al/avoid_sync_broadcast_lut
remotes/origin/al/backup_multi_gpu
remotes/origin/al/balanced_decomposition
remotes/origin/al/bench_multi_gpu_throughput
remotes/origin/al/chore/measure_pbs_timings
remotes/origin/al/chore/measure_scalar_shift_timings
remotes/origin/al/ci_fixes
remotes/origin/al/clean_unused_functions
remotes/origin/al/cuda_backend_build
remotes/origin/al/cuda_malloc
remotes/origin/al/cufftdx
remotes/origin/al/debug_gpu
remotes/origin/al/debug_launch_failure
remotes/origin/al/default_config_gpu
remotes/origin/al/div_multi_gpu
remotes/origin/al/div_perf_multi_gpu
remotes/origin/al/erc20_workflows
remotes/origin/al/fix_clippy_error
remotes/origin/al/fix_compression
remotes/origin/al/fix_full_prop
remotes/origin/al/fix_gpu_index
remotes/origin/al/fix_mul_mem
remotes/origin/al/fix_scalar_shifts
remotes/origin/al/fix_shift_rotate_buffer
remotes/origin/al/fix_sm_size
remotes/origin/al/fix_tests
remotes/origin/al/fixes
remotes/origin/al/gpu_array_hl_api
remotes/origin/al/gpu_same_params_as_cpu
remotes/origin/al/improve_cmux
remotes/origin/al/improve_scalar_div
remotes/origin/al/investigate_launch_failure_bug
remotes/origin/al/investigate_rtx_bugs
remotes/origin/al/investigate_scalar_add_sub_failures
remotes/origin/al/l40_bench
remotes/origin/al/long_run_tests
remotes/origin/al/many_lut_test
remotes/origin/al/multi_gpu_debug
remotes/origin/al/multi_gpu_dispatch_lwe
remotes/origin/al/multi_gpu_dispatch_lwe_alt
remotes/origin/al/multi_gpu_doc
remotes/origin/al/multi_gpu_fix_div
remotes/origin/al/multi_gpu_refactor_rebased
remotes/origin/al/natural_decompoisition_order
remotes/origin/al/noise_test
remotes/origin/al/nvidia-smi-in-workflows
remotes/origin/al/nvidia-smi-in-workflows-alt
remotes/origin/al/optimize_mul_mem
remotes/origin/al/p2p_mutex
remotes/origin/al/p5_bench_workflow
remotes/origin/al/pfail_gpu
remotes/origin/al/pin_ksk_host
remotes/origin/al/pinned_host_mem_bsk
remotes/origin/al/pinned_host_memory
remotes/origin/al/reduce_memory_pressure_scalar_mul
remotes/origin/al/refactor_lwe_chunk_size
remotes/origin/al/remove_host_decoration
remotes/origin/al/remove_mempool_acess
remotes/origin/al/remove_ms_comp
remotes/origin/al/remove_omp_div
remotes/origin/al/reset_test_threads
remotes/origin/al/restrict_number_of_gpus
remotes/origin/al/reuse_mem_cmux
remotes/origin/al/rust_binding
remotes/origin/al/set_higher_max_shared_memory
remotes/origin/al/signed_long_run_tests
remotes/origin/al/signed_scalar_div
remotes/origin/al/simplify_4090_bench
remotes/origin/al/small_fixes
remotes/origin/al/sum_ct_single_gpu
remotes/origin/al/sum_ct_vec_multi_gpu
remotes/origin/al/test_device_other_than_zero
remotes/origin/al/troubleshoot_h100_tests
remotes/origin/al/tuniform_params
remotes/origin/al/use_byte_add
remotes/origin/am/chore/01-to-02-keys
remotes/origin/am/chore/05-to-06
remotes/origin/am/chore/add-assert-for-lwe-dim-pbs
remotes/origin/am/chore/add-noise-and-security-curves-crates
remotes/origin/am/chore/bench-size
remotes/origin/am/chore/bump-version
remotes/origin/am/chore/bump-zk-version
remotes/origin/am/chore/change-instance-gpu-build
remotes/origin/am/chore/ci-auto-data-branch
remotes/origin/am/chore/clippy-bug
remotes/origin/am/chore/comp
remotes/origin/am/chore/compress-1-bits
remotes/origin/am/chore/container-metadata
remotes/origin/am/chore/dep-updates
remotes/origin/am/chore/do-not-run-fft-workflows-on-push
remotes/origin/am/chore/doc-casting
remotes/origin/am/chore/doc-link-fix
remotes/origin/am/chore/docs-0-8
remotes/origin/am/chore/expose-rng
remotes/origin/am/chore/expose-rng-0.7
remotes/origin/am/chore/expose-rng-0.8
remotes/origin/am/chore/fix-fft-bench-parser
remotes/origin/am/chore/fix-incorrec-comment
remotes/origin/am/chore/fix-wasm-timeout
remotes/origin/am/chore/investigate-wasm-again
remotes/origin/am/chore/make-ks-level-order-consistent
remotes/origin/am/chore/multi-bit-4-4
remotes/origin/am/chore/multi-bit-test
remotes/origin/am/chore/no-test-internal
remotes/origin/am/chore/pk-ks-params-big-key
remotes/origin/am/chore/python-web-driver
remotes/origin/am/chore/remove-open-handle-detection
remotes/origin/am/chore/remove-remaining-modular-std-dev
remotes/origin/am/chore/remove-serde-derive-glwe-body
remotes/origin/am/chore/remove-some-concrete-branding
remotes/origin/am/chore/sampling-tool
remotes/origin/am/chore/sampling-tool-karatsuba
remotes/origin/am/chore/saved-ntt
remotes/origin/am/chore/slack-notif-status
remotes/origin/am/chore/tfhe-fft
remotes/origin/am/chore/tfhe-ntt
remotes/origin/am/chore/trivium-test
remotes/origin/am/chore/update-nightly
remotes/origin/am/chore/update-wasm-bench
remotes/origin/am/chore/update-wop-parameters
remotes/origin/am/chore/utils-shortint
remotes/origin/am/chore/wop-bug
remotes/origin/am/chore/zk-v2-manage-upper-bound-params
remotes/origin/am/dev/forward-comp-as-0-5
remotes/origin/am/doc/forward-compatibility
remotes/origin/am/doc/migrate-data-to-0-5
remotes/origin/am/docs-revamp
remotes/origin/am/doctest_bug
remotes/origin/am/doctest_bug_minify
remotes/origin/am/exp/g4
remotes/origin/am/feat/TUniform
remotes/origin/am/feat/any-q-decomposition
remotes/origin/am/feat/cpk-cast
remotes/origin/am/feat/csk-decompress
remotes/origin/am/feat/debug-pbs-count
remotes/origin/am/feat/deparallelized-multi-bit
remotes/origin/am/feat/forward-comp-as-0-5
remotes/origin/am/feat/keyswitch-flexible-modulus
remotes/origin/am/feat/keytricks-part1
remotes/origin/am/feat/keytricks-part2
remotes/origin/am/feat/keytricks-part3
remotes/origin/am/feat/ks-with-scalar-change
remotes/origin/am/feat/more-efficient-integer-list-expansion
remotes/origin/am/feat/ntt-pbs
remotes/origin/am/feat/par-ksk-gen
remotes/origin/am/feat/prime-q-lwe-glwe-ggsw
remotes/origin/am/feat/prime-q-support-part4
remotes/origin/am/feat/prime-q-support-part5
remotes/origin/am/feat/q
remotes/origin/am/feat/raw-parts-for-ksk
remotes/origin/am/feat/semver-trick
remotes/origin/am/feat/updated-ntt-pbs
remotes/origin/am/feat/v2-padding-bit
remotes/origin/am/feat/zk-metadata
remotes/origin/am/feat/zk-proofs
remotes/origin/am/feat/zk-v1-padding
remotes/origin/am/fix/balanced-decomposition-gpu
remotes/origin/am/fix/pedro-mbb-wasm
remotes/origin/am/fix/shortint-ksk-noise-level
remotes/origin/am/fix/trivial-0-wop
remotes/origin/am/fix/use-proper-stair-ks-dim
remotes/origin/am/fix/wop-crt-lut-gen
remotes/origin/am/hack/shortint-wop-return-keys
remotes/origin/am/hack/tuniform
remotes/origin/am/noise_test
remotes/origin/am/refactor-ggsw-encryption-inputs-consistent
remotes/origin/am/refactor/better-compressed-pk
remotes/origin/am/refactor/bsk-pfpksk-match-decomposer-order
remotes/origin/am/refactor/core-c-api
remotes/origin/am/refactor/factorize-expansion-code
remotes/origin/am/refactor/ggsw-encryption-inputs-consistent
remotes/origin/am/refactor/non-native-decomposer
remotes/origin/am/refactor/rng-fallibility
remotes/origin/am/refactor/shortint-factorize-key-noise
remotes/origin/am/refactor/shortint-params
remotes/origin/am/refactor/simplify-shortint-decompression
remotes/origin/am/refactor/use-natural-decomposition-order
remotes/origin/am/saved/crt_wip
remotes/origin/am/test/add-some-noise-formulas
remotes/origin/am/test/malicious-booleans
remotes/origin/am/test/many-lut
remotes/origin/am/test/noise-checks
remotes/origin/am/wip-f
remotes/origin/am/wip/fft-sampling
remotes/origin/am/wip/ntt
remotes/origin/am/wip/pfail
remotes/origin/am/wip/prime-q-greedy
remotes/origin/am/wip/rng-next
remotes/origin/am/wip/simulation
remotes/origin/am/wip/sts-testing
remotes/origin/am/wip/tensor-prod
remotes/origin/am/wip/updated-q
remotes/origin/am/wip/zk-bench
remotes/origin/artifact_ccs_2024
remotes/origin/bb/signed_div
remotes/origin/bb/signed_overflow_add
remotes/origin/bc/ci-estimator
remotes/origin/bc/ci/estimator
remotes/origin/bench/gpu/div_rem
remotes/origin/bench/gpu/pbs
remotes/origin/bench_trace_packing_keyswitch
remotes/origin/better-conformance
remotes/origin/bitnot-free
remotes/origin/blockchain-usecase
remotes/origin/blogpost_example_0.2
remotes/origin/c-threading
remotes/origin/cb/feat/add_glwe_tensor_product_and_lwe_trace_packing_keyswitch
remotes/origin/cb/feat/support_large_custom_modulus_multiplication
remotes/origin/chore/add-slsa-framework
remotes/origin/chore/as_optimize_pks_ml
remotes/origin/chore/gpu/4090_core_crypto_bench
remotes/origin/chore/gpu/bench_gpu_multi_bit_gf_4_pbs
remotes/origin/chore/gpu/bench_scratch
remotes/origin/chore/gpu/cuda_vec
remotes/origin/chore/gpu/fourier_bsk
remotes/origin/chore/gpu/get_lwe_chunk_size_refactor
remotes/origin/chore/gpu/lwechunksize_refactor
remotes/origin/chore/gpu/measure_ks_time
remotes/origin/chore/gpu/multibitpbs
remotes/origin/chore/gpu/pbs_naming
remotes/origin/chore/gpu/pbsstruct
remotes/origin/chore/gpu/reduce_scratch_time
remotes/origin/chore/gpu/reduce_sumctvec_memory
remotes/origin/chore/gpu/remove_amortized_degree
remotes/origin/chore/gpu/speedup_signed_comparisons
remotes/origin/chore/gpu/sum_ciphertexts_comments
remotes/origin/chore/multi_bit_pbs_test
remotes/origin/chore/param
remotes/origin/chre/gpu/integer_mult_gpu_params
remotes/origin/ci/bench_big_machine
remotes/origin/ci/bench_multi_bits
remotes/origin/ci/parallel_bench
remotes/origin/ci/select_benchmarks
remotes/origin/ci/wip
remotes/origin/clean_smart_ops
remotes/origin/cuda/sprint_demo_bench
remotes/origin/custom_modulus_trace_packing_KS_and_tensor_product
remotes/origin/custom_modulus_trace_packing_KS_and_tensor_product_trivial_approach_decomposer
remotes/origin/dd/fix/bls446-serialize
remotes/origin/dependabot/github_actions/actions/checkout-4.1.5
remotes/origin/dependabot/github_actions/actions/checkout-4.1.6
remotes/origin/dependabot/github_actions/actions/checkout-4.2.0
remotes/origin/dependabot/github_actions/actions/upload-artifact-4.3.1
remotes/origin/dependabot/github_actions/rtCamp/action-slack-notify-2.3.1
remotes/origin/dependabot/github_actions/tj-actions/changed-files-44.5.2
remotes/origin/dependabot/github_actions/zama-ai/slab-github-runner-1.4.0
remotes/origin/dependabot/github_actions/zama-ai/slab-github-runner-1dced74825027fe3d481392163ed8fc56813fb5d
remotes/origin/dependabot/github_actions/zama-ai/slab-github-runner-2f49f88318c59d644bb6d329b3259d3ec48e1e34
remotes/origin/dependabot/github_actions/zama-ai/slab-github-runner-da46a715fa5ed427dfd4f3e34126b06923c93231
remotes/origin/dependabot/github_actions/zgosalvez/github-actions-ensure-sha-pinned-actions-3.0.15
remotes/origin/dev
remotes/origin/doc-merge-benchmarks-pages
remotes/origin/doc-proofreading
remotes/origin/doc/0.5.0/gpu
remotes/origin/doc/0.5.0/overflow
remotes/origin/doc/wip0.3.1
remotes/origin/docs-revamp
remotes/origin/dt/bench/bench_80b_integers
remotes/origin/dt/bench/bench_ops_dedup
remotes/origin/dt/bench/big_sizes_blockchain
remotes/origin/dt/bench/crt_ops
remotes/origin/dt/bench/fix_naming_gpu
remotes/origin/dt/bench/fix_signed_scalar_naming
remotes/origin/dt/bench/integer_on_4090
remotes/origin/dt/bench/integer_tuniform_default
remotes/origin/dt/bench/oprf
remotes/origin/dt/bench/other_hardware
remotes/origin/dt/bench/pbs128
remotes/origin/dt/bench/pbs_various_perror
remotes/origin/dt/bench/pbs_with_shortint
remotes/origin/dt/bench/test_compact_keys
remotes/origin/dt/bench/test_parameters_secu_and_pfail
remotes/origin/dt/bench/throughput
remotes/origin/dt/bench/throughput_heuristic
remotes/origin/dt/bench/trivium_app
remotes/origin/dt/bench/wasm_benchs
remotes/origin/dt/chore/bench_full_0.4.1
remotes/origin/dt/chore/update_params
remotes/origin/dt/chore/update_tuniform_params
remotes/origin/dt/ci/bench_gpu_v100
remotes/origin/dt/ci/bench_pfks
remotes/origin/dt/ci/cargo_deny
remotes/origin/dt/ci/change_rust_action
remotes/origin/dt/ci/check_labels
remotes/origin/dt/ci/check_typos
remotes/origin/dt/ci/coverage_tests
remotes/origin/dt/ci/cuda_release
remotes/origin/dt/ci/filter_schedule
remotes/origin/dt/ci/fix_bench_parser
remotes/origin/dt/ci/fix_firefox_wasm
remotes/origin/dt/ci/fix_gpu_integer_bench
remotes/origin/dt/ci/fix_slack_notif
remotes/origin/dt/ci/fix_throughput_bench
remotes/origin/dt/ci/fix_weekly_cuda_bench
remotes/origin/dt/ci/gpu_bench_hyperstack
remotes/origin/dt/ci/gpu_tests_separated
remotes/origin/dt/ci/handle_forks
remotes/origin/dt/ci/improve_core_crypto_coverage
remotes/origin/dt/ci/improve_coverage_core
remotes/origin/dt/ci/integer_coverage
remotes/origin/dt/ci/lattice_estimator_update
remotes/origin/dt/ci/multi_bit_gpu_bench
remotes/origin/dt/ci/no_cancel_workflow
remotes/origin/dt/ci/npm_labels
remotes/origin/dt/ci/pcc_test_gpu
remotes/origin/dt/ci/remove_params
remotes/origin/dt/ci/remove_wasm_test
remotes/origin/dt/ci/slab_gh_runner
remotes/origin/dt/ci/test_multibit_params
remotes/origin/dt/ci/test_on_changes
remotes/origin/dt/ci/test_perfo
remotes/origin/dt/ci/test_slab_action
remotes/origin/dt/ci/update_new_workflows
remotes/origin/dt/ci/use_large_runner
remotes/origin/dt/ci/vector_sum_bench
remotes/origin/dt/doc/gpu_bench_array
remotes/origin/dt/refacto/rename_operations
remotes/origin/dt/test/check_perf
remotes/origin/dt/test/sanity_checks_tuniform
remotes/origin/expose-traits
remotes/origin/feat/as_generalize_gemm_pks_all_params
remotes/origin/feat/cuda/abs
remotes/origin/feat/cuda/multi_bit_tbc_registers
remotes/origin/feat/cuda/new_type_cast
remotes/origin/feat/cuda/new_type_cast_no_round
remotes/origin/feat/cuda/signed_div
remotes/origin/feat/cuda_fft/reduce_sm_io
remotes/origin/feat/gpu/apply_lut_with_factor
remotes/origin/feat/gpu/benchmark
remotes/origin/feat/gpu/cuda_vector_add
remotes/origin/feat/gpu/cufftdx
remotes/origin/feat/gpu/div
remotes/origin/feat/gpu/div_backup
remotes/origin/feat/gpu/div_backup2
remotes/origin/feat/gpu/div_backup2_cleanup
remotes/origin/feat/gpu/div_backup_2
remotes/origin/feat/gpu/div_rem_update_info
remotes/origin/feat/gpu/fft128
remotes/origin/feat/gpu/first_multi_gpu
remotes/origin/feat/gpu/generate_last_block_inner_propagation
remotes/origin/feat/gpu/host_rotate_blocks
remotes/origin/feat/gpu/monokernel_multibitpbs
remotes/origin/feat/gpu/mul_boolean_optimization
remotes/origin/feat/gpu/multi_gpu
remotes/origin/feat/gpu/multi_gpu_buffers
remotes/origin/feat/gpu/overflow_add
remotes/origin/feat/gpu/overflowing_scalar_add_and_sub
remotes/origin/feat/gpu/overflowing_sub
remotes/origin/feat/gpu/propagate_single_carry_get_input_carries
remotes/origin/feat/gpu/refactorchaos
remotes/origin/feat/gpu/reuse_pbs_memory
remotes/origin/feat/gpu/scalar_mul
remotes/origin/feat/gpu/scalareq
remotes/origin/feat/gpu/shift_rotate
remotes/origin/feat/gpu/shifts_one_wave
remotes/origin/feat/gpu/signed
remotes/origin/feat/gpu/signed_bitops
remotes/origin/feat/gpu/signed_comparisons
remotes/origin/feat/gpu/signed_eq_ne
remotes/origin/feat/gpu/signed_if_then_else
remotes/origin/feat/gpu/signed_mul
remotes/origin/feat/gpu/signed_overflowing_add_or_sub
remotes/origin/feat/gpu/signed_overflowing_add_or_sub_with_cuda
remotes/origin/feat/gpu/signed_overflowing_sub
remotes/origin/feat/gpu/signed_scalar_add
remotes/origin/feat/gpu/signed_scalar_bitops
remotes/origin/feat/gpu/signed_scalar_comparisons
remotes/origin/feat/gpu/signed_scalar_shift
remotes/origin/feat/gpu/signed_scalar_sub_mul
remotes/origin/feat/gpu/signedmaxmin
remotes/origin/feat/gpu/sm_multibit
remotes/origin/feat/gpu/tbc
remotes/origin/feat/gpu/threadblockcluster
remotes/origin/feat/gpu/unsigned_cast
remotes/origin/feat/gpu/unsigned_overflowing_scalar_add
remotes/origin/feature/hl_casting_keys_2
remotes/origin/feature/trivium
remotes/origin/fix-addition-for-concrete
remotes/origin/fix-comp
remotes/origin/fix/bench/ks
remotes/origin/fix/bench_parsing
remotes/origin/fix/gpu/fix_terms_degree_bug
remotes/origin/fix/gpu/full_carry_prop_noise_level
remotes/origin/fix/gpu/maxsharedmemory
remotes/origin/fix/gpu/single_carry_propagation
remotes/origin/full-propa
remotes/origin/gitbook-edit
remotes/origin/go/chore/add-gpu-parameters-multibit-gf4
remotes/origin/go/chore/add-lazy-loading-info-to-documentation
remotes/origin/go/chore/add-nvtx-gpu
remotes/origin/go/chore/gpu-new-bitnot
remotes/origin/go/chore/gpu-sampling-tool
remotes/origin/go/chore/implement-is-equal
remotes/origin/go/chore/review-keyswitch
remotes/origin/go/chore/review-opt-choice-on-h100-for-multi-bit-pbs
remotes/origin/go/chore/simple-test-fftgf4
remotes/origin/go/chore/update-classical-params-gpu
remotes/origin/go/chore/update-gpu-parameters
remotes/origin/go/chore/use-new-gpu-params
remotes/origin/go/feature/implement-leading-trailing-zeros-and-one-tests
remotes/origin/go/feature/implement-leading-trailing-zeros-and-ones
remotes/origin/go/feature/implement-many-luts
remotes/origin/go/feature/implement-sub-array-search
remotes/origin/go/feature/implement-subarray-search-gpu
remotes/origin/go/refactor/apply-overlapping-strategy-on-keybundle
remotes/origin/go/refactor/improve-fft-gpu
remotes/origin/go/refactor/improve-full-propagation-and-sum-algorithms
remotes/origin/go/refactor/improve-keybundle-further
remotes/origin/go/refactor/keybundle
remotes/origin/go/refactor/specify-launch-bounds-on-kernels
remotes/origin/go/refactor/understand-classical-pbs-bottlenecks
remotes/origin/go/refactor/use-const-and-restrict-on-gpu
remotes/origin/go/wip/improve-syncs
remotes/origin/go/wip/pfail_gpu
remotes/origin/gpu/hyperstack_ci
remotes/origin/hlapi
remotes/origin/hlapi-1
remotes/origin/hlapi-clean
remotes/origin/hlapi-fhe-bool-cuda
remotes/origin/integer-prelude
remotes/origin/integration/0.6/versioning
remotes/origin/integration/main/versioning
remotes/origin/integration/versioning
remotes/origin/jb/experiments_pke
remotes/origin/jb/tmp
remotes/origin/jkl/agnes-noise_test-branch
remotes/origin/jkl/fft-noise-model
remotes/origin/joc_benches
remotes/origin/kc1212/witness-dim
remotes/origin/main
remotes/origin/multibit-deterministic
remotes/origin/mz/boolean_oprf
remotes/origin/mz/cleanups
remotes/origin/mz/clippy_hl_c_api
remotes/origin/mz/clippy_trivium_bench
remotes/origin/mz/conformnce_proven_cctl
remotes/origin/mz/ct_mod_compression
remotes/origin/mz/destruct_conformance
remotes/origin/mz/doc_prf
remotes/origin/mz/enable_all_doctest_warnings
remotes/origin/mz/fix_ambiguity
remotes/origin/mz/fix_apply_lookup_table_bivariate_check
remotes/origin/mz/fix_clippy_doctests
remotes/origin/mz/fix_fhe_strings
remotes/origin/mz/fix_lint_build
remotes/origin/mz/fix_max_noise_level
remotes/origin/mz/fix_oom
remotes/origin/mz/fix_prf_p_error
remotes/origin/mz/fix_versionning
remotes/origin/mz/glwe_ms_compression
remotes/origin/mz/glwe_packing
remotes/origin/mz/integer_compression_key
remotes/origin/mz/just
remotes/origin/mz/key_conformance
remotes/origin/mz/max_noise_level_woppbs_parameters
remotes/origin/mz/move_strings
remotes/origin/mz/mpi
remotes/origin/mz/mpi_no_priority
remotes/origin/mz/ms_compression
remotes/origin/mz/ms_compression_hl
remotes/origin/mz/ms_compression_multi_bit
remotes/origin/mz/ms_compression_shortint
remotes/origin/mz/ms_compression_shortint_integer
remotes/origin/mz/pack_in_carry
remotes/origin/mz/refacto_wop_params
remotes/origin/mz/refactor_decryption
remotes/origin/mz/regroup_optionals_in_proofs
remotes/origin/mz/remove_noise_CompactCt
remotes/origin/mz/rename_LweCiphertextListParameters
remotes/origin/mz/rename_multi_bit
remotes/origin/mz/shortint/fix_degree
remotes/origin/mz/stabilize_fhe_types
remotes/origin/mz/string_server_key
remotes/origin/mz/string_tests
remotes/origin/mz/strings_function_executor
remotes/origin/mz/test-compat
remotes/origin/mz/test_shortint_lamellar
remotes/origin/mz/update_toolchain
remotes/origin/mz/use_secrets_in_ci
remotes/origin/new-front-bivarfix
remotes/origin/ns/0.6/doc_versioning
remotes/origin/ns/0.6_with_versionize
remotes/origin/ns/backport_0.6/fix_zk_32b
remotes/origin/ns/backward_compat_hl
remotes/origin/ns/bench_tfhe_zk_pok
remotes/origin/ns/bench_zk_cs_ghl
remotes/origin/ns/check_markdown_intra_links
remotes/origin/ns/chore/params_zkv1
remotes/origin/ns/chore/update_node
remotes/origin/ns/clippy_versionable
remotes/origin/ns/feat/integer_slice
remotes/origin/ns/fix_bench_zk_pok
remotes/origin/ns/ignore_data_repo_doc
remotes/origin/ns/refactor_zk_tests
remotes/origin/ns/release/0.7.2
remotes/origin/ns/release_0.6.4
remotes/origin/ns/safe_serialize_api
remotes/origin/ns/test/bisect_pbs_fail
remotes/origin/ns/test/data_automerge
remotes/origin/ns/test/noise_check_perf
remotes/origin/ns/test/noise_check_perf_revert
remotes/origin/ns/test/pr_close
remotes/origin/ns/test_data_lists
remotes/origin/ns/test_rayon_bitslice
remotes/origin/ns/test_type_breaking
remotes/origin/ns/test_zk_bad_noise
remotes/origin/ns/update_arkworks
remotes/origin/ns/update_toolchain_2024-04
remotes/origin/ns/upgrade_data_07
remotes/origin/ns/versionable_unsupported
remotes/origin/ns/versionize_arrays
remotes/origin/ns/versionize_clean
remotes/origin/ns/versionize_transparent
remotes/origin/ns/versionize_zk
remotes/origin/ns/zk_curve_conformance
remotes/origin/ns/zk_test_bad_ct
remotes/origin/ns/zk_test_bad_delta
remotes/origin/ns/zk_test_wasm_x86
remotes/origin/ntt-experiment
remotes/origin/pa/chore/alt_pbs_chunk_size
remotes/origin/pa/chore/cmux
remotes/origin/pa/chore/improve_compression_test
remotes/origin/pa/chore/log2
remotes/origin/pa/chore/pbs
remotes/origin/pa/feat/benchmark_packing_ks
remotes/origin/pa/feat/compression
remotes/origin/pa/fix/cmux
remotes/origin/pa/fix/compression
remotes/origin/pa/fix/compression_alt
remotes/origin/pa/fix/multigpu_compression
remotes/origin/pa/refactor/monokernel_tbc
remotes/origin/pa/refactor/sampleextraction
remotes/origin/readme-restructure
remotes/origin/rebase_trace_packing_keyswitch
remotes/origin/release/0.1.x
remotes/origin/release/0.10.x
remotes/origin/release/0.2.x
remotes/origin/release/0.3.x
remotes/origin/release/0.4.x
remotes/origin/release/0.5.x
remotes/origin/release/0.6.x
remotes/origin/release/0.7.x
remotes/origin/release/0.8.x
remotes/origin/release/0.9.x
remotes/origin/release_pr_stack
remotes/origin/saved/all-decomp-formulas
remotes/origin/saved/am/chore/dep-updates
remotes/origin/saved/am/chore/release/0.7.x
remotes/origin/saved/am/chore/update-wop-parameters
remotes/origin/saved/mul-failure
remotes/origin/sk/feat/batch-pbs
remotes/origin/sk/feat/cuda-fft
remotes/origin/sk/feat/ks-optimization
remotes/origin/sk/feat/ntt-keygen
remotes/origin/sk/feat/ntt-pbs
remotes/origin/sk/feat/perf-modular-arithmetic
remotes/origin/sk/feat/zk-pke-v2
remotes/origin/sk/refactor/fft
remotes/origin/smart-ops-hlapi
remotes/origin/testing_lwe_keyswitch_solinas
remotes/origin/tm/0.6-backports
remotes/origin/tm/better-compact-list
remotes/origin/tm/bind-match
remotes/origin/tm/c-api-error
remotes/origin/tm/compiler-inter-op
remotes/origin/tm/count_ones
remotes/origin/tm/cpk-param-getter
remotes/origin/tm/erc20
remotes/origin/tm/faster-carry-prop
remotes/origin/tm/faster-scalar-add
remotes/origin/tm/faster-shift
remotes/origin/tm/fix-empty-prop
remotes/origin/tm/fix-sub-signed-overflow
remotes/origin/tm/full-prop-clean
remotes/origin/tm/hl-bigger-types
remotes/origin/tm/hlapi-compressed
remotes/origin/tm/hlapi-gpu-tests
remotes/origin/tm/hlstrings
remotes/origin/tm/new-sum
remotes/origin/tm/noise-asserts
remotes/origin/tm/pub-reexports
remotes/origin/tm/reduce-test-per-params
remotes/origin/tm/safer-eq-ne
remotes/origin/tm/scalar-cmps
remotes/origin/tm/shorting-scalar-div
remotes/origin/tm/shortint-encoding
remotes/origin/tm/tensor-hlapi
remotes/origin/tm/test-num-blocks
remotes/origin/tm/unsigned-cmps
remotes/origin/tm/vector-contains-sub
remotes/origin/tm/vector-hlapi
remotes/origin/tm/zk-conincal-serde
remotes/origin/tmp_ccs
remotes/origin/tore/internal-val-access
remotes/origin/trace_packing_keyswitch
remotes/origin/trace_packing_ks_benchmarks
remotes/origin/updated_trace_packing_for_native_ciphertext_modulus
remotes/origin/wip-compile-error-no-release
remotes/origin/yuxizama-patch-1

4
cplx_mul.log Normal file
View File

@@ -0,0 +1,4 @@
-224091083899144574993365467136.000000 15330585716229.072266 224091083972931551288203673600.000000 -15306289712228.677734
-224091083899144574993365467136.00000 15331705505758.05664 224091083899144574993365467136.00000 -15307409501759.66211
224091083899144574993365467136.00000 -15307409501759.66211 224091083899144574993365467136.00000 -15331705505758.05664
224091083899144574993365467136.00000 -15307409501759.66211 224091083899144574993365467136.00000 -15331705505758.05664

115
cpu.log Normal file
View File

@@ -0,0 +1,115 @@
-0.3309726757578757561084614735591458156704902648925781250000000000000000000000000000000000000000000000 -0.0000000000000000262161441032774636130079215314589595089813221258498096566391666328854626044631004333 0.0380704474245701987578094360742397839203476905822753906250000000000000000000000000000000000000000000 -0.0000000000000000009616181682828339618607862290825046798652604462695437856356583949946070788428187370
-0.1992581793808917978161332484887680038809776306152343750000000000000000000000000000000000000000000000 -0.0000000000000000075339409234980193504176368576269248248756960902950199909788864260917762294411659241 0.0176522535578225135688423819146919413469731807708740234375000000000000000000000000000000000000000000 -0.0000000000000000011079942189849169212186822638096097214445589711721970246904866996828786795958876610
0.2260550165243644937795863825158448889851570129394531250000000000000000000000000000000000000000000000 0.0000000000000000126022745915227752500212969900318988585096703503442110150256638689825194887816905975 -0.0799689927978955578113939850481983739882707595825195312500000000000000000000000000000000000000000000 -0.0000000000000000063460023733636283936840017401873121454740939053999759789093104700441472232341766357
0.2426504064692105999689886175474384799599647521972656250000000000000000000000000000000000000000000000 0.0000000000000000004677977136086304752259424595325139275587325315104779248354116560904003563337028027 0.0365333979932580613092518717621715040877461433410644531250000000000000000000000000000000000000000000 0.0000000000000000005709344770022019545359771749790997852063776745568752721271543748571275500580668449
0.0914749448999244568758726359192223753780126571655273437500000000000000000000000000000000000000000000 -0.0000000000000000036609340702821524448717294490586665457459787183420747008755213869335420895367860794 0.2786308216747127586288002021319698542356491088867187500000000000000000000000000000000000000000000000 0.0000000000000000041379918916404281616895809751333352103206111649103329797050321303686359897255897522
-0.0124926024286186147793653589133100467734038829803466796875000000000000000000000000000000000000000000 -0.0000000000000000006937190272456474979519993342187765906642550802765293666007906381310021970421075821 0.2110491012472701377600259320388431660830974578857421875000000000000000000000000000000000000000000000 -0.0000000000000000051785532317672362668742834231741586121061746167014080977075707323820097371935844421
-0.0494172631142162574469089975082169985398650169372558593750000000000000000000000000000000000000000000 0.0000000000000000020700342936472428610421033900271927805735103256831817347682900276595319155603647232 -0.2029124738196469157625045909298933111131191253662109375000000000000000000000000000000000000000000000 0.0000000000000000032892946516813893505437931152017216374523186022500384484423818776122061535716056824
0.0315092344667192991725102046984829939901828765869140625000000000000000000000000000000000000000000000 0.0000000000000000026511341643412592805126523014407753588898695525533522748817816250266332644969224930 -0.2988676928669205534738750884571345522999763488769531250000000000000000000000000000000000000000000000 -0.0000000000000000223519185626510679759809968042136173580417081359914335814131902679946506395936012268
-0.1672368467298269156806611590582178905606269836425781250000000000000000000000000000000000000000000000 -0.0000000000000000048162869081133674839304134386709639116236975270920334102342508231231477111577987671 0.2727747087939790260335826133086811751127243041992187500000000000000000000000000000000000000000000000 -0.0000000000000000085823320193215261882172073504002543651078135111842128124326478655348182655870914459
-0.1420260232345169348899815986442263238131999969482421875000000000000000000000000000000000000000000000 0.0000000000000000078353130752317586555795127350300028590782462991921192285715846992388833314180374146 0.1452463415301429028314572633462375961244106292724609375000000000000000000000000000000000000000000000 -0.0000000000000000130538715628498043839887390457914535898550603454414467674560285104234935715794563293
0.1033287219076595153666175974649377167224884033203125000000000000000000000000000000000000000000000000 0.0000000000000000065787245791957383370689888773812031301831047645908498444633494273148244246840476990 -0.1956877991027157481784826131843146868050098419189453125000000000000000000000000000000000000000000000 -0.0000000000000000108434329174532879698514604527317036512444788844025493623757228078829939477145671844
0.2183523172540651480222351210613851435482501983642578125000000000000000000000000000000000000000000000 -0.0000000000000000042952334406807644211423904079848498142548976906260858876773767178747220896184444427 -0.1611283900775630495427748201109352521598339080810546875000000000000000000000000000000000000000000000 0.0000000000000000049035193854080055673983422185833393998012569040749301929604797578576835803687572479
0.2397036106352415651965515053234412334859371185302734375000000000000000000000000000000000000000000000 -0.0000000000000000052953977110488860650721110650147839714434454694985676681540098797995597124099731445 0.1115723403910117567106041747138078790158033370971679687500000000000000000000000000000000000000000000 -0.0000000000000000050113374213532207659401750409473001366666780484688051289232646468008169904351234436
0.1398900907499183998705660769701353274285793304443359375000000000000000000000000000000000000000000000 0.0000000000000000025854512863426880011882823985481452679392879801315019427931929385522380471229553223 0.1766487032839706705633631145246908999979496002197265625000000000000000000000000000000000000000000000 0.0000000000000000047294440057263980645363900766920231270194547596580608023186798050119250547140836716
-0.1644190411459663658799712493419065140187740325927734375000000000000000000000000000000000000000000000 -0.0000000000000000061410496854399099844472379531271533022912420237207631323883560980902984738349914551 -0.1171702953286619941497548325060051865875720977783203125000000000000000000000000000000000000000000000 0.0000000000000000018800082941866710238088777334985347003193960609764186220349557743247714824974536896
-0.2271417216519029391719186605769209563732147216796875000000000000000000000000000000000000000000000000 -0.0000000000000000003410581758850931755798386799378379375470085294421492688732300280207709874957799911 -0.2324424613666220273966445120095158927142620086669921875000000000000000000000000000000000000000000000 0.0000000000000000052705161220801475140341690423664438031522828137140877780009162734131678007543087006
-0.2768098499713873561489663188694976270198822021484375000000000000000000000000000000000000000000000000 0.0000000000000000271839307303537356402336171088715461786860875246282093220528963684046175330877304077 0.1777761056245066584313718749399413354694843292236328125000000000000000000000000000000000000000000000 0.0000000000000000031436349524061540422500698511335631041026594357601451484862664642605523113161325455
-0.1816506822808992283047047067157109268009662628173828125000000000000000000000000000000000000000000000 -0.0000000000000000091744046880454127720279131854420783978544864776692377428268798666977090761065483093 0.0859671095812945679215388850025192368775606155395507812500000000000000000000000000000000000000000000 -0.0000000000000000019790221122024815278620829931698194201352001468739022493892321108432952314615249634
0.1733917003068170814561455017610569484531879425048828125000000000000000000000000000000000000000000000 -0.0000000000000000002355669226462113621703291114551109740391745850677726603628059365291846916079521179 -0.1507412427231122065141022403622628189623355865478515625000000000000000000000000000000000000000000000 -0.0000000000000000050074860604269868780685029482838904707624616828879823837716855905455304309725761414
0.2504400589351173378460657659161370247602462768554687500000000000000000000000000000000000000000000000 -0.0000000000000000105495846440776700797075650436594384534827739945562347090302068863820750266313552856 -0.0608093817831304034315920148401346523314714431762695312500000000000000000000000000000000000000000000 -0.0000000000000000006649734888645786681087537662594592372951811360499638214882622833101777359843254089
0.1862075376662823411599845258024288341403007507324218750000000000000000000000000000000000000000000000 0.0000000000000000092738192415434320391903987162228447654735493063692976234424492076868773438036441803 0.2071792938507401871106594626326113939285278320312500000000000000000000000000000000000000000000000000 0.0000000000000000037279901254371484709180590427832381461048739621779671976620207374253368470817804337
0.0649597768658507629391252180539595428854227066040039062500000000000000000000000000000000000000000000 -0.0000000000000000000599891608510439561166750358405470603488327470983847156116297583139385096728801727 0.2077132880219826616130518459613085724413394927978515625000000000000000000000000000000000000000000000 0.0000000000000000109365476335817531496040397320749442280186865305083299126920337585033848881721496582
-0.1149666930854301022391084075024991761893033981323242187500000000000000000000000000000000000000000000 -0.0000000000000000046719677244762014686309320217246739870689257903384195014240276577766053378582000732 -0.1693829849301254908411351607355754822492599487304687500000000000000000000000000000000000000000000000 0.0000000000000000019290098218370279959780502759588964735884392347577739185160439205901639070361852646
-0.1013849860233902999695487778808455914258956909179687500000000000000000000000000000000000000000000000 -0.0000000000000000010958363504292690122307333050960276846345426518287025779985199847033072728663682938 -0.2972510798575715429414856316725490614771842956542968750000000000000000000000000000000000000000000000 0.0000000000000000151060681535435571571128080956451898860237590587266010011546768510015681385993957520
-0.0338490332315699790632912424825917696580290794372558593750000000000000000000000000000000000000000000 -0.0000000000000000032112051133747497270867287155696078651722246333124338724118729260226245969533920288 0.3056514274567712630137350515724392607808113098144531250000000000000000000000000000000000000000000000 -0.0000000000000000084660885661200502048130129231798305394126263491287077977975883413819246925413608551
-0.0838544664005262951889108080649748444557189941406250000000000000000000000000000000000000000000000000 -0.0000000000000000051800512586075403991650040606243722019399192612806870350539156788727268576622009277 0.1888297106619587217846145676958258263766765594482421875000000000000000000000000000000000000000000000 0.0000000000000000125710005198254852176108184946779873729544425121429709890641390757082263007760047913
0.0259785877288864451573058289568507461808621883392333984375000000000000000000000000000000000000000000 0.0000000000000000003808995729306562665530126889433911280323679558962660096632646400394150987267494202 -0.2127655593344512374720522984716808423399925231933593750000000000000000000000000000000000000000000000 -0.0000000000000000019935625096555455247881784143722497167520067235186101428645599753508577123284339905
0.1435981506670927776436741396537399850785732269287109375000000000000000000000000000000000000000000000 0.0000000000000000131126020340999405450862544727246234289488926316605624244893135710299247875809669495 -0.2472170630286313064427616836837842129170894622802734375000000000000000000000000000000000000000000000 -0.0000000000000000122808089510788868829075694614557771638870714467047309814695665863837348297238349915
0.2511343731265572865041235672833863645792007446289062500000000000000000000000000000000000000000000000 0.0000000000000000092452280298070285367329115251744832217129003532020067868124613141844747588038444519 0.0108656958033641493771570551984950725454837083816528320312500000000000000000000000000000000000000000 -0.0000000000000000002947488139454235324126363051764372718819140216213133855183148313017227337695658207
0.2025455324622073560725254992576083168387413024902343750000000000000000000000000000000000000000000000 0.0000000000000000092268658403218626649354055433850145426009984179994624642340284026431618258357048035 0.1183192308221643201937567368986492510885000228881835937500000000000000000000000000000000000000000000 -0.0000000000000000033112219949499888538350744600518247336266158650688439663012374580830510240048170090
-0.1933471753078349342214181660892791114747524261474609375000000000000000000000000000000000000000000000 0.0000000000000000056708946471137426758117517411030692033211668293211232816375400034303311258554458618 -0.0524965901614871180425048180495650740340352058410644531250000000000000000000000000000000000000000000 0.0000000000000000026002120264691327416023942568955159965262954086960469485267921641025168355554342270
-0.3123928209210611317558914379333145916461944580078125000000000000000000000000000000000000000000000000 0.0000000000000000259207635313622839687841546274592684607857754166323344113820326128916349261999130249 -0.1116379705409853706488121360962395556271076202392578125000000000000000000000000000000000000000000000 0.0000000000000000066179487727938613466908117740857190639118025083514895712255565740633755922317504883
32768
340282366920938463463374607431768170496
340282366920938463463374607431768080384
340282366920938463463374607431768023040
340282366920938463463374607431768092672
340282366920938463463374607431768047616
340282366920938463463374607431767556096
71168
277888
1262592
4018176
16760832
67108864
268304384
1074266112
4295099124
17180026739
68719574034
274877809558
1099511650955
4398046581374
17592185995320
70368743494672
281474976801920
1125899906931200
4503599627216896
18014398509543424
72057594037780480
288230376151842816
1152921504606584832
4611686018426339328
18446744073709455341
73786976294838222848
295147905179352834048
1180591620717411139584
4722366482869645363200
18889465931478580777984
75557863725914323349504
302231454903657293758464
1208925819614629174814720
4835703278458516698808960
19342813113834066795323392
77371252455336267180941312
309485009821345068724764672
1237940039285380274898993152
4951760157141521099596627968
19807040628566084398385463296
79228162514264337593544017586
316912650057057350374175694614
1267650600228229401496703230278
5070602400912917605986813129648
20282409603651670423947251205115
81129638414606681695789005047098
324518553658426726783156020669176
1298074214633706907132624082931600
5192296858534827628530496329081216
20769187434139310514121985316977152
83076749736557242056487941267441664
332306998946228968225951765070360576
1329227995784915872903807060280262656
5316911983139663491615228241121247232
21267647932558653966460912964485251072
85070591730234615865843651857941004288
58358
test core_crypto::fft_impl::fft128::math::fft::tests::test_roundtrip_u128 ... ok
test result: ok. 1 passed; 0 failed; 0 ignored; 0 measured; 345 filtered out; finished in 0.01s
running 0 tests
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
running 0 tests
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
running 0 tests
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 211 filtered out; finished in 0.01s

221
gpu.log Normal file
View File

@@ -0,0 +1,221 @@
0.7071067811865475727373109293694142252206802368164062500000000000000000000 -0.0000000000000000483364665672645672552734986488347382506490835271488470948 0.7071067811865475727373109293694142252206802368164062500000000000000000000 -0.0000000000000000483364665672645672552734986488347382506490835271488470948
0.9238795325112867384831361050601117312908172607421875000000000000000000000 0.0000000000000000176450470843366770599569725809896737820548554530070833879 0.3826834323650897817792326804919866845011711120605468750000000000000000000 -0.0000000000000000100507726964615876116866635215079604149716565502162279877
-0.3826834323650897817792326804919866845011711120605468750000000000000000000 0.0000000000000000100507726964615876116866635215079604149716565502162279877 0.9238795325112867384831361050601117312908172607421875000000000000000000000 0.0000000000000000176450470843366770599569725809896737820548554530070833879
0.9807852804032304305792422383092343807220458984375000000000000000000000000 0.0000000000000000185469399978250057259083323855314298012062932205111162598 0.1950903220161282758393639369387528859078884124755859375000000000000000000 -0.0000000000000000079910790684617312634428970369307223484248965070806183797
-0.1950903220161282758393639369387528859078884124755859375000000000000000000 0.0000000000000000079910790684617312634428970369307223484248965070806183797 0.9807852804032304305792422383092343807220458984375000000000000000000000000 0.0000000000000000185469399978250057259083323855314298012062932205111162598
0.5555702330196021776487214083317667245864868164062500000000000000000000000 0.0000000000000000470941094056167682138404887426404929202505139640422271441 0.8314696123025452356714026791451033204793930053710937500000000000000000000 0.0000000000000000014073856984728023893078524226353211699334975107286878349
-0.8314696123025452356714026791451033204793930053710937500000000000000000000 -0.0000000000000000014073856984728023893078524226353211699334975107286878349 0.5555702330196021776487214083317667245864868164062500000000000000000000000 0.0000000000000000470941094056167682138404887426404929202505139640422271441
0.9951847266721969287317506314138881862163543701171875000000000000000000000 -0.0000000000000000424869136783044157591677787910035744642531197693552380557 0.0980171403295606036287779261328978464007377624511718750000000000000000000 -0.0000000000000000016345823622442606095622288086432645091240042271949622910
-0.0980171403295606036287779261328978464007377624511718750000000000000000000 0.0000000000000000016345823622442606095622288086432645091240042271949622910 0.9951847266721969287317506314138881862163543701171875000000000000000000000 -0.0000000000000000424869136783044157591677787910035744642531197693552380557
0.6343932841636454877942696839454583823680877685546875000000000000000000000 0.0000000000000000104209019292800237913543214649613418305006117285573669085 0.7730104533627369933768136434082407504320144653320312500000000000000000000 -0.0000000000000000325659070336497908244691644422883020460214305129287293283
-0.7730104533627369933768136434082407504320144653320312500000000000000000000 0.0000000000000000325659070336497908244691644422883020460214305129287293283 0.6343932841636454877942696839454583823680877685546875000000000000000000000 0.0000000000000000104209019292800237913543214649613418305006117285573669085
0.8819212643483550495560052695509511977434158325195312500000000000000000000 -0.0000000000000000198432484058905683070896551742611855256204741109827063839 0.4713967368259976420397094898362411186099052429199218750000000000000000000 0.0000000000000000065166781360690129644720854136527256367167864498925970024
-0.4713967368259976420397094898362411186099052429199218750000000000000000000 -0.0000000000000000065166781360690129644720854136527256367167864498925970024 0.8819212643483550495560052695509511977434158325195312500000000000000000000 -0.0000000000000000198432484058905683070896551742611855256204741109827063839
0.2902846772544623865641710835916455835103988647460937500000000000000000000 -0.0000000000000000189279787077742607055779713593855819062153334180614860927 0.9569403357322088243819280251045711338520050048828125000000000000000000000 0.0000000000000000405538698618756882243805365526526472088078930809326716478
-0.9569403357322088243819280251045711338520050048828125000000000000000000000 -0.0000000000000000405538698618756882243805365526526472088078930809326716478 0.2902846772544623865641710835916455835103988647460937500000000000000000000 -0.0000000000000000189279787077742607055779713593855819062153334180614860927
0.9987954562051724050064649418345652520656585693359375000000000000000000000 -0.0000000000000000122916933370754648023069228195440464413706083235768885820 0.0490676743274180149345653489945107139647006988525390625000000000000000000 -0.0000000000000000006796103720518276261450983336612892861148698896541419864
-0.0490676743274180149345653489945107139647006988525390625000000000000000000 0.0000000000000000006796103720518276261450983336612892861148698896541419864 0.9987954562051724050064649418345652520656585693359375000000000000000000000 -0.0000000000000000122916933370754648023069228195440464413706083235768885820
0.6715589548470184411144145997241139411926269531250000000000000000000000000 -0.0000000000000000404890377492966801319705884961725462258731885432927466439 0.7409511253549591058842338497925084084272384643554687500000000000000000000 -0.0000000000000000147086169522973421017060338640487283358362467376636434357
-0.7409511253549591058842338497925084084272384643554687500000000000000000000 0.0000000000000000147086169522973421017060338640487283358362467376636434357 0.6715589548470184411144145997241139411926269531250000000000000000000000000 -0.0000000000000000404890377492966801319705884961725462258731885432927466439
0.9039892931234433381959547659789677709341049194335937500000000000000000000 -0.0000000000000000066097544687484293087601477313978651325255137129839264065 0.4275550934302820849097770405933260917663574218750000000000000000000000000 0.0000000000000000094111898162954679947731789440894191883937120943204313051
-0.4275550934302820849097770405933260917663574218750000000000000000000000000 -0.0000000000000000094111898162954679947731789440894191883937120943204313051 0.9039892931234433381959547659789677709341049194335937500000000000000000000 -0.0000000000000000066097544687484293087601477313978651325255137129839264065
0.3368898533922200511092626129538984969258308410644531250000000000000000000 -0.0000000000000000004200094003347528494601431516305399984771095349699076074 0.9415440651830208063088889502978418022394180297851562500000000000000000000 -0.0000000000000000278963795476983349094016418681950862291293519244599063978
-0.9415440651830208063088889502978418022394180297851562500000000000000000000 0.0000000000000000278963795476983349094016418681950862291293519244599063978 0.3368898533922200511092626129538984969258308410644531250000000000000000000 -0.0000000000000000004200094003347528494601431516305399984771095349699076074
0.9700312531945439742386838588572572916746139526367187500000000000000000000 0.0000000000000000183653003484288443908920092041031891826103053310748416949 0.2429801799032638986997056917971349321305751800537109375000000000000000000 -0.0000000000000000087514315297196631565769806659755604546308109536762592784
-0.2429801799032638986997056917971349321305751800537109375000000000000000000 0.0000000000000000087514315297196631565769806659755604546308109536762592784 0.9700312531945439742386838588572572916746139526367187500000000000000000000 0.0000000000000000183653003484288443908920092041031891826103053310748416949
0.5141027441932217723064013625844381749629974365234375000000000000000000000 -0.0000000000000000457127075236156239511828925399355339402327136821488351437 0.8577286100002721180857179206213913857936859130859375000000000000000000000 -0.0000000000000000481834479363366263073139653840536793747977248347259815464
-0.8577286100002721180857179206213913857936859130859375000000000000000000000 0.0000000000000000481834479363366263073139653840536793747977248347259815464 0.5141027441932217723064013625844381749629974365234375000000000000000000000 -0.0000000000000000457127075236156239511828925399355339402327136821488351437
0.8032075314806449428672863177780527621507644653320312500000000000000000000 -0.0000000000000000330606098048149096139985697961182567790418192774440364623 0.5956993044924333569056784654094371944665908813476562500000000000000000000 -0.0000000000000000134386419365794626148451101058230490536144850885393736695
-0.5956993044924333569056784654094371944665908813476562500000000000000000000 0.0000000000000000134386419365794626148451101058230490536144850885393736695 0.8032075314806449428672863177780527621507644653320312500000000000000000000 -0.0000000000000000330606098048149096139985697961182567790418192774440364623
0.1467304744553617479319029826001496985554695129394531250000000000000000000 0.0000000000000000037269471470465677476267642910816717181318621877547658805 0.9891765099647810144389836750633548945188522338867187500000000000000000000 -0.0000000000000000409873099370471113819595571800653989851994072636780924723
-0.9891765099647810144389836750633548945188522338867187500000000000000000000 0.0000000000000000409873099370471113819595571800653989851994072636780924723 0.1467304744553617479319029826001496985554695129394531250000000000000000000 0.0000000000000000037269471470465677476267642910816717181318621877547658805
sizeof_scalar: 128
poly.size(): 64
before cuda api: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000000000000000000000000000000000000000000000000000000000000
0.7071067811865475727373109293694142252206802368164062500000000000000000000 -0.0000000000000000483364665672645672552734986488347382506490835271488470948 0.7071067811865475727373109293694142252206802368164062500000000000000000000 -0.0000000000000000483364665672645672552734986488347382506490835271488470948
0.9238795325112867384831361050601117312908172607421875000000000000000000000 0.0000000000000000176450470843366770599569725809896737820548554530070833879 0.3826834323650897817792326804919866845011711120605468750000000000000000000 -0.0000000000000000100507726964615876116866635215079604149716565502162279877
-0.3826834323650897817792326804919866845011711120605468750000000000000000000 0.0000000000000000100507726964615876116866635215079604149716565502162279877 0.9238795325112867384831361050601117312908172607421875000000000000000000000 0.0000000000000000176450470843366770599569725809896737820548554530070833879
0.9807852804032304305792422383092343807220458984375000000000000000000000000 0.0000000000000000185469399978250057259083323855314298012062932205111162598 0.1950903220161282758393639369387528859078884124755859375000000000000000000 -0.0000000000000000079910790684617312634428970369307223484248965070806183797
-0.1950903220161282758393639369387528859078884124755859375000000000000000000 0.0000000000000000079910790684617312634428970369307223484248965070806183797 0.9807852804032304305792422383092343807220458984375000000000000000000000000 0.0000000000000000185469399978250057259083323855314298012062932205111162598
0.5555702330196021776487214083317667245864868164062500000000000000000000000 0.0000000000000000470941094056167682138404887426404929202505139640422271441 0.8314696123025452356714026791451033204793930053710937500000000000000000000 0.0000000000000000014073856984728023893078524226353211699334975107286878349
-0.8314696123025452356714026791451033204793930053710937500000000000000000000 -0.0000000000000000014073856984728023893078524226353211699334975107286878349 0.5555702330196021776487214083317667245864868164062500000000000000000000000 0.0000000000000000470941094056167682138404887426404929202505139640422271441
0.9951847266721969287317506314138881862163543701171875000000000000000000000 -0.0000000000000000424869136783044157591677787910035744642531197693552380557 0.0980171403295606036287779261328978464007377624511718750000000000000000000 -0.0000000000000000016345823622442606095622288086432645091240042271949622910
-0.0980171403295606036287779261328978464007377624511718750000000000000000000 0.0000000000000000016345823622442606095622288086432645091240042271949622910 0.9951847266721969287317506314138881862163543701171875000000000000000000000 -0.0000000000000000424869136783044157591677787910035744642531197693552380557
0.6343932841636454877942696839454583823680877685546875000000000000000000000 0.0000000000000000104209019292800237913543214649613418305006117285573669085 0.7730104533627369933768136434082407504320144653320312500000000000000000000 -0.0000000000000000325659070336497908244691644422883020460214305129287293283
-0.7730104533627369933768136434082407504320144653320312500000000000000000000 0.0000000000000000325659070336497908244691644422883020460214305129287293283 0.6343932841636454877942696839454583823680877685546875000000000000000000000 0.0000000000000000104209019292800237913543214649613418305006117285573669085
0.8819212643483550495560052695509511977434158325195312500000000000000000000 -0.0000000000000000198432484058905683070896551742611855256204741109827063839 0.4713967368259976420397094898362411186099052429199218750000000000000000000 0.0000000000000000065166781360690129644720854136527256367167864498925970024
-0.4713967368259976420397094898362411186099052429199218750000000000000000000 -0.0000000000000000065166781360690129644720854136527256367167864498925970024 0.8819212643483550495560052695509511977434158325195312500000000000000000000 -0.0000000000000000198432484058905683070896551742611855256204741109827063839
0.2902846772544623865641710835916455835103988647460937500000000000000000000 -0.0000000000000000189279787077742607055779713593855819062153334180614860927 0.9569403357322088243819280251045711338520050048828125000000000000000000000 0.0000000000000000405538698618756882243805365526526472088078930809326716478
-0.9569403357322088243819280251045711338520050048828125000000000000000000000 -0.0000000000000000405538698618756882243805365526526472088078930809326716478 0.2902846772544623865641710835916455835103988647460937500000000000000000000 -0.0000000000000000189279787077742607055779713593855819062153334180614860927
0.9987954562051724050064649418345652520656585693359375000000000000000000000 -0.0000000000000000122916933370754648023069228195440464413706083235768885820 0.0490676743274180149345653489945107139647006988525390625000000000000000000 -0.0000000000000000006796103720518276261450983336612892861148698896541419864
-0.0490676743274180149345653489945107139647006988525390625000000000000000000 0.0000000000000000006796103720518276261450983336612892861148698896541419864 0.9987954562051724050064649418345652520656585693359375000000000000000000000 -0.0000000000000000122916933370754648023069228195440464413706083235768885820
0.6715589548470184411144145997241139411926269531250000000000000000000000000 -0.0000000000000000404890377492966801319705884961725462258731885432927466439 0.7409511253549591058842338497925084084272384643554687500000000000000000000 -0.0000000000000000147086169522973421017060338640487283358362467376636434357
-0.7409511253549591058842338497925084084272384643554687500000000000000000000 0.0000000000000000147086169522973421017060338640487283358362467376636434357 0.6715589548470184411144145997241139411926269531250000000000000000000000000 -0.0000000000000000404890377492966801319705884961725462258731885432927466439
0.9039892931234433381959547659789677709341049194335937500000000000000000000 -0.0000000000000000066097544687484293087601477313978651325255137129839264065 0.4275550934302820849097770405933260917663574218750000000000000000000000000 0.0000000000000000094111898162954679947731789440894191883937120943204313051
-0.4275550934302820849097770405933260917663574218750000000000000000000000000 -0.0000000000000000094111898162954679947731789440894191883937120943204313051 0.9039892931234433381959547659789677709341049194335937500000000000000000000 -0.0000000000000000066097544687484293087601477313978651325255137129839264065
0.3368898533922200511092626129538984969258308410644531250000000000000000000 -0.0000000000000000004200094003347528494601431516305399984771095349699076074 0.9415440651830208063088889502978418022394180297851562500000000000000000000 -0.0000000000000000278963795476983349094016418681950862291293519244599063978
-0.9415440651830208063088889502978418022394180297851562500000000000000000000 0.0000000000000000278963795476983349094016418681950862291293519244599063978 0.3368898533922200511092626129538984969258308410644531250000000000000000000 -0.0000000000000000004200094003347528494601431516305399984771095349699076074
0.9700312531945439742386838588572572916746139526367187500000000000000000000 0.0000000000000000183653003484288443908920092041031891826103053310748416949 0.2429801799032638986997056917971349321305751800537109375000000000000000000 -0.0000000000000000087514315297196631565769806659755604546308109536762592784
-0.2429801799032638986997056917971349321305751800537109375000000000000000000 0.0000000000000000087514315297196631565769806659755604546308109536762592784 0.9700312531945439742386838588572572916746139526367187500000000000000000000 0.0000000000000000183653003484288443908920092041031891826103053310748416949
0.5141027441932217723064013625844381749629974365234375000000000000000000000 -0.0000000000000000457127075236156239511828925399355339402327136821488351437 0.8577286100002721180857179206213913857936859130859375000000000000000000000 -0.0000000000000000481834479363366263073139653840536793747977248347259815464
-0.8577286100002721180857179206213913857936859130859375000000000000000000000 0.0000000000000000481834479363366263073139653840536793747977248347259815464 0.5141027441932217723064013625844381749629974365234375000000000000000000000 -0.0000000000000000457127075236156239511828925399355339402327136821488351437
0.8032075314806449428672863177780527621507644653320312500000000000000000000 -0.0000000000000000330606098048149096139985697961182567790418192774440364623 0.5956993044924333569056784654094371944665908813476562500000000000000000000 -0.0000000000000000134386419365794626148451101058230490536144850885393736695
-0.5956993044924333569056784654094371944665908813476562500000000000000000000 0.0000000000000000134386419365794626148451101058230490536144850885393736695 0.8032075314806449428672863177780527621507644653320312500000000000000000000 -0.0000000000000000330606098048149096139985697961182567790418192774440364623
0.1467304744553617479319029826001496985554695129394531250000000000000000000 0.0000000000000000037269471470465677476267642910816717181318621877547658805 0.9891765099647810144389836750633548945188522338867187500000000000000000000 -0.0000000000000000409873099370471113819595571800653989851994072636780924723
-0.9891765099647810144389836750633548945188522338867187500000000000000000000 0.0000000000000000409873099370471113819595571800653989851994072636780924723 0.1467304744553617479319029826001496985554695129394531250000000000000000000 0.0000000000000000037269471470465677476267642910816717181318621877547658805
0.0000000000000000000000000000000000000088162076311671563097655240291668426 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000002168404344971008868014905601739883422851562500000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000000000000440810381558357815488276201458342129 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000008673617379884035472059622406959533691406250000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000000000001851403602545102825050760046125036943 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000034694469519536141888238489627838134765625000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000000000007493776486492082863300695424791816196 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000138777878078144567552953958511352539062500000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000000000030063268022280003016300436939458933210 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000000555111512312578270211815834045410156250000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000000000120341234165431683628299402998127401267 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000002220446049250313080847263336181640625000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000000000481453098738038406076295267232801273493 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000008881784197001252323389053344726562500000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000000001925900557028465295868278724171496762396 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000035527136788005009293556213378906250000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000000007703690390190172855036212551926278718010 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000142108547152020037174224853515625000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000000030814849722837003091707947862945406540468 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000000568434188608080148696899414062500000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000000123259487053424324038394889107021917830297 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000002273736754432320594787597656250000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000000493038036375773607825142654083327962989612 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000009094947017729282379150390625000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000001972152233665170742972133713988552143626874 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000036379788070917129516601562500000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000007888609022822759283560097953609448866175922 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000145519152283668518066406250000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000031554436179453113445911954912093035756372115 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000000582076609134674072265625000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000126217744805974530095319382746027383317156888 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000002328306436538696289062500000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000000504870979312060196692949094081764773560295977 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000009313225746154785156250000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000002019483917336402863083467939424714334532852332 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000037252902984619140625000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000008077935669433773528645543320796512578423077754 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000149011611938476562500000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000032311742677823256190893844846283705553983979440 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000000596046447753906250000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000129246970711381186839887050948232477456227586186 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000002384185791015625000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000000516987882845612909435859875356027565065202013171 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000009536743164062500000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000002067951531382539799819751172987207915501099721110 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000038146972656250000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000008271806125530247361355316363511929317244690552867 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000152587890625000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000033087224502121077607497577125610814924219053879893 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0000610351562500000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000132348898008484398592066620174006357352116507187999 0.0000000000000000000000000000000000000000000000000000000000000000000000000 0.0002441406250000000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000000529395592033937711917701562924776226282119750976562 -0.0000000000000000000000000000000000000029387358770557187699218413430556142 0.0009765625000000000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000002117582368135750847670806251699104905128479003906250 -0.0000000000000000000000000000000000000029387358770557187699218413430556142 0.0039062500000000000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000008470329472543003390683225006796419620513916015625000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142 0.0156250000000000000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000033881317890172013562732900027185678482055664062500000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142 0.0625000000000000000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000135525271560688054250931600108742713928222656250000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142 0.2500000000000000000000000000000000000000000000000000000000000000000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142
0.0000000000000000000542101086242752217003726400434970855712890625000000000 -0.0000000000000000000000000000000000000029387358770557187699218413430556142 -0.0000000000000000000000000000000000000029387358770557187699218413430556142 0.0000000000000000000000000000000000000000000000000000000000000000000000000
Params::degree 64
opt 4
BUTTERFLY_DEPTH 1
LOG2_DEGREE 5
HALF_DEGREE 16
STRIDE 16
Params::degree 32
opt 2
-0.3309726757578757561084614735591458156704902648925781250000000000000000000000000000000000000000000000 -0.0000000000000000262161441032774636130079215314589595089813221258498096566391666328854626044631004333 0.0380704474245701987578094360742397839203476905822753906250000000000000000000000000000000000000000000 -0.0000000000000000009616181682828339618607862290825046798652604462695437856356583949946070788428187370
-0.1992581793808917978161332484887680038809776306152343750000000000000000000000000000000000000000000000 -0.0000000000000000075339409234980193504176368576269248248756960902950199909788864260917762294411659241 0.0176522535578225135688423819146919413469731807708740234375000000000000000000000000000000000000000000 -0.0000000000000000011079942189849169212186822638096097214445589711721970246904866996828786795958876610
0.2260550165243644937795863825158448889851570129394531250000000000000000000000000000000000000000000000 0.0000000000000000126022745915227752500212969900318988585096703503442110150256638689825194887816905975 -0.0799689927978955578113939850481983739882707595825195312500000000000000000000000000000000000000000000 -0.0000000000000000063460023733636283936840017401873121454740939053999759789093104700441472232341766357
0.2426504064692105999689886175474384799599647521972656250000000000000000000000000000000000000000000000 0.0000000000000000004677977136086304752259424595325139275587325315104779248354116560904003563337028027 0.0365333979932580613092518717621715040877461433410644531250000000000000000000000000000000000000000000 0.0000000000000000005709344770022019545359771749790997852063776745568752721271543748571275500580668449
0.0914749448999244568758726359192223753780126571655273437500000000000000000000000000000000000000000000 -0.0000000000000000036609340702821524448717294490586665457459787183420747008755213869335420895367860794 0.2786308216747127586288002021319698542356491088867187500000000000000000000000000000000000000000000000 0.0000000000000000041379918916404281616895809751333352103206111649103329797050321303686359897255897522
-0.0124926024286186147793653589133100467734038829803466796875000000000000000000000000000000000000000000 -0.0000000000000000006937190272456474979519993342187765906642550802765293666007906381310021970421075821 0.2110491012472701377600259320388431660830974578857421875000000000000000000000000000000000000000000000 -0.0000000000000000051785532317672362668742834231741586121061746167014080977075707323820097371935844421
-0.0494172631142162574469089975082169985398650169372558593750000000000000000000000000000000000000000000 0.0000000000000000020700342936472428610421033900271927805735103256831817347682900276595319155603647232 -0.2029124738196469157625045909298933111131191253662109375000000000000000000000000000000000000000000000 0.0000000000000000032892946516813893505437931152017216374523186022500384484423818776122061535716056824
0.0315092344667192991725102046984829939901828765869140625000000000000000000000000000000000000000000000 0.0000000000000000026511341643412592805126523014407753588898695525533522748817816250266332644969224930 -0.2988676928669205534738750884571345522999763488769531250000000000000000000000000000000000000000000000 -0.0000000000000000223519185626510679759809968042136173580417081359914335814131902679946506395936012268
-0.1672368467298269156806611590582178905606269836425781250000000000000000000000000000000000000000000000 -0.0000000000000000048162869081133674839304134386709639116236975270920334102342508231231477111577987671 0.2727747087939790260335826133086811751127243041992187500000000000000000000000000000000000000000000000 -0.0000000000000000085823320193215261882172073504002543651078135111842128124326478655348182655870914459
-0.1420260232345169348899815986442263238131999969482421875000000000000000000000000000000000000000000000 0.0000000000000000078353130752317586555795127350300028590782462991921192285715846992388833314180374146 0.1452463415301429028314572633462375961244106292724609375000000000000000000000000000000000000000000000 -0.0000000000000000130538715628498043839887390457914535898550603454414467674560285104234935715794563293
0.1033287219076595153666175974649377167224884033203125000000000000000000000000000000000000000000000000 0.0000000000000000065787245791957383370689888773812031301831047645908498444633494273148244246840476990 -0.1956877991027157481784826131843146868050098419189453125000000000000000000000000000000000000000000000 -0.0000000000000000108434329174532879698514604527317036512444788844025493623757228078829939477145671844
0.2183523172540651480222351210613851435482501983642578125000000000000000000000000000000000000000000000 -0.0000000000000000042952334406807644211423904079848498142548976906260858876773767178747220896184444427 -0.1611283900775630495427748201109352521598339080810546875000000000000000000000000000000000000000000000 0.0000000000000000049035193854080055673983422185833393998012569040749301929604797578576835803687572479
0.2397036106352415651965515053234412334859371185302734375000000000000000000000000000000000000000000000 -0.0000000000000000052953977110488860650721110650147839714434454694985676681540098797995597124099731445 0.1115723403910117567106041747138078790158033370971679687500000000000000000000000000000000000000000000 -0.0000000000000000050113374213532207659401750409473001366666780484688051289232646468008169904351234436
0.1398900907499183998705660769701353274285793304443359375000000000000000000000000000000000000000000000 0.0000000000000000025854512863426880011882823985481452679392879801315019427931929385522380471229553223 0.1766487032839706705633631145246908999979496002197265625000000000000000000000000000000000000000000000 0.0000000000000000047294440057263980645363900766920231270194547596580608023186798050119250547140836716
-0.1644190411459663658799712493419065140187740325927734375000000000000000000000000000000000000000000000 -0.0000000000000000061410496854399099844472379531271533022912420237207631323883560980902984738349914551 -0.1171702953286619941497548325060051865875720977783203125000000000000000000000000000000000000000000000 0.0000000000000000018800082941866710238088777334985347003193960609764186220349557743247714824974536896
-0.2271417216519029391719186605769209563732147216796875000000000000000000000000000000000000000000000000 -0.0000000000000000003410581758850931755798386799378379375470085294421492688732300280207709874957799911 -0.2324424613666220273966445120095158927142620086669921875000000000000000000000000000000000000000000000 0.0000000000000000052705161220801475140341690423664438031522828137140877780009162734131678007543087006
-0.2768098499713873561489663188694976270198822021484375000000000000000000000000000000000000000000000000 0.0000000000000000271839307303537356402336171088715461786860875246282093220528963684046175330877304077 0.1777761056245066584313718749399413354694843292236328125000000000000000000000000000000000000000000000 0.0000000000000000031436349524061540422500698511335631041026594357601451484862664642605523113161325455
-0.1816506822808992283047047067157109268009662628173828125000000000000000000000000000000000000000000000 -0.0000000000000000091744046880454127720279131854420783978544864776692377428268798666977090761065483093 0.0859671095812945679215388850025192368775606155395507812500000000000000000000000000000000000000000000 -0.0000000000000000019790221122024815278620829931698194201352001468739022493892321108432952314615249634
0.1733917003068170814561455017610569484531879425048828125000000000000000000000000000000000000000000000 -0.0000000000000000002355669226462113621703291114551109740391745850677726603628059365291846916079521179 -0.1507412427231122065141022403622628189623355865478515625000000000000000000000000000000000000000000000 -0.0000000000000000050074860604269868780685029482838904707624616828879823837716855905455304309725761414
0.2504400589351173378460657659161370247602462768554687500000000000000000000000000000000000000000000000 -0.0000000000000000105495846440776700797075650436594384534827739945562347090302068863820750266313552856 -0.0608093817831304034315920148401346523314714431762695312500000000000000000000000000000000000000000000 -0.0000000000000000006649734888645786681087537662594592372951811360499638214882622833101777359843254089
0.1862075376662823411599845258024288341403007507324218750000000000000000000000000000000000000000000000 0.0000000000000000092738192415434320391903987162228447654735493063692976234424492076868773438036441803 0.2071792938507401871106594626326113939285278320312500000000000000000000000000000000000000000000000000 0.0000000000000000037279901254371484709180590427832381461048739621779671976620207374253368470817804337
0.0649597768658507629391252180539595428854227066040039062500000000000000000000000000000000000000000000 -0.0000000000000000000599891608510439561166750358405470603488327470983847156116297583139385096728801727 0.2077132880219826616130518459613085724413394927978515625000000000000000000000000000000000000000000000 0.0000000000000000109365476335817531496040397320749442280186865305083299126920337585033848881721496582
-0.1149666930854301022391084075024991761893033981323242187500000000000000000000000000000000000000000000 -0.0000000000000000046719677244762014686309320217246739870689257903384195014240276577766053378582000732 -0.1693829849301254908411351607355754822492599487304687500000000000000000000000000000000000000000000000 0.0000000000000000019290098218370279959780502759588964735884392347577739185160439205901639070361852646
-0.1013849860233902999695487778808455914258956909179687500000000000000000000000000000000000000000000000 -0.0000000000000000010958363504292690122307333050960276846345426518287025779985199847033072728663682938 -0.2972510798575715429414856316725490614771842956542968750000000000000000000000000000000000000000000000 0.0000000000000000151060681535435571571128080956451898860237590587266010011546768510015681385993957520
-0.0338490332315699790632912424825917696580290794372558593750000000000000000000000000000000000000000000 -0.0000000000000000032112051133747497270867287155696078651722246333124338724118729260226245969533920288 0.3056514274567712630137350515724392607808113098144531250000000000000000000000000000000000000000000000 -0.0000000000000000084660885661200502048130129231798305394126263491287077977975883413819246925413608551
-0.0838544664005262951889108080649748444557189941406250000000000000000000000000000000000000000000000000 -0.0000000000000000051800512586075403991650040606243722019399192612806870350539156788727268576622009277 0.1888297106619587217846145676958258263766765594482421875000000000000000000000000000000000000000000000 0.0000000000000000125710005198254852176108184946779873729544425121429709890641390757082263007760047913
0.0259785877288864451573058289568507461808621883392333984375000000000000000000000000000000000000000000 0.0000000000000000003808995729306562665530126889433911280323679558962660096632646400394150987267494202 -0.2127655593344512374720522984716808423399925231933593750000000000000000000000000000000000000000000000 -0.0000000000000000019935625096555455247881784143722497167520067235186101428645599753508577123284339905
0.1435981506670927776436741396537399850785732269287109375000000000000000000000000000000000000000000000 0.0000000000000000131126020340999405450862544727246234289488926316605624244893135710299247875809669495 -0.2472170630286313064427616836837842129170894622802734375000000000000000000000000000000000000000000000 -0.0000000000000000122808089510788868829075694614557771638870714467047309814695665863837348297238349915
0.2511343731265572865041235672833863645792007446289062500000000000000000000000000000000000000000000000 0.0000000000000000092452280298070285367329115251744832217129003532020067868124613141844747588038444519 0.0108656958033641493771570551984950725454837083816528320312500000000000000000000000000000000000000000 -0.0000000000000000002947488139454235324126363051764372718819140216213133855183148313017227337695658207
0.2025455324622073560725254992576083168387413024902343750000000000000000000000000000000000000000000000 0.0000000000000000092268658403218626649354055433850145426009984179994624642340284026431618258357048035 0.1183192308221643201937567368986492510885000228881835937500000000000000000000000000000000000000000000 -0.0000000000000000033112219949499888538350744600518247336266158650688439663012374580830510240048170090
-0.1933471753078349342214181660892791114747524261474609375000000000000000000000000000000000000000000000 0.0000000000000000056708946471137426758117517411030692033211668293211232816375400034303311258554458618 -0.0524965901614871180425048180495650740340352058410644531250000000000000000000000000000000000000000000 0.0000000000000000026002120264691327416023942568955159965262954086960469485267921641025168355554342270
-0.3123928209210611317558914379333145916461944580078125000000000000000000000000000000000000000000000000 0.0000000000000000259207635313622839687841546274592684607857754166323344113820326128916349261999130249 -0.1116379705409853706488121360962395556271076202392578125000000000000000000000000000000000000000000000 0.0000000000000000066179487727938613466908117740857190639118025083514895712255565740633755922317504883
32768
340282366920938463463374607431768170496
340282366920938463463374607431768080384
340282366920938463463374607431768023040
340282366920938463463374607431768092672
340282366920938463463374607431768047616
340282366920938463463374607431767556096
71168
277888
1262592
4018176
16760832
67108864
268304384
1074266112
4295099124
17180026739
68719574034
274877809558
1099511650955
4398046581374
17592185995320
70368743494672
281474976801920
1125899906931200
4503599627216896
18014398509543424
72057594037780480
288230376151842816
1152921504606584832
4611686018426339328
18446744073709455341
73786976294838222848
295147905179352834048
1180591620717411139584
4722366482869645363200
18889465931478580777984
75557863725914323349504
302231454903657293758464
1208925819614629174814720
4835703278458516698808960
19342813113834066795323392
77371252455336267180941312
309485009821345068724764672
1237940039285380274898993152
4951760157141521099596627968
19807040628566084398385463296
79228162514264337593544017586
316912650057057350374175694614
1267650600228229401496703230278
5070602400912917605986813129648
20282409603651670423947251205115
81129638414606681695789005047098
324518553658426726783156020669176
1298074214633706907132624082931600
5192296858534827628530496329081216
20769187434139310514121985316977152
83076749736557242056487941267441664
332306998946228968225951765070360576
1329227995784915872903807060280262656
5316911983139663491615228241121247232
21267647932558653966460912964485251072
85070591730234615865843651857941004288
58358
test core_crypto::gpu::algorithms::test::fft::test_roundtrip_u128 ... ok
test result: ok. 1 passed; 0 failed; 0 ignored; 0 measured; 354 filtered out; finished in 0.19s
running 0 tests
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
running 0 tests
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 0 filtered out; finished in 0.00s
running 0 tests
test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 200 filtered out; finished in 0.01s

0
gpu.logeset Normal file
View File

68
parse_f128_twiddles.py Normal file
View File

@@ -0,0 +1,68 @@
#!/usr/bin/env python3
import sys
def parse_file(filename):
with open(filename, 'r') as f:
# Read all non-empty lines (you can also filter out comments if needed)
lines = [line.strip() for line in f if line.strip()]
# The first line is expected to be the count (e.g. "4096")
try:
count = int(lines[0])
except ValueError:
sys.exit("Error: The first line must be an integer (the count of data lines).")
# Initialize lists for each twiddle array.
neg_twiddles_re_hi = []
neg_twiddles_re_lo = []
neg_twiddles_im_hi = []
neg_twiddles_im_lo = []
# Process each subsequent line.
for i, line in enumerate(lines[1:], start=1):
tokens = line.split()
if len(tokens) != 4:
sys.exit(f"Error on line {i+1}: expected 4 tokens, found {len(tokens)}.")
neg_twiddles_re_hi.append(tokens[0])
neg_twiddles_re_lo.append(tokens[1])
neg_twiddles_im_hi.append(tokens[2])
neg_twiddles_im_lo.append(tokens[3])
if len(neg_twiddles_re_hi) != count:
print(f"Warning: Count mismatch. Expected {count} entries but found {len(neg_twiddles_re_hi)}.")
count = len(neg_twiddles_re_hi) # adjust count to the actual number of data lines
return count, neg_twiddles_re_hi, neg_twiddles_re_lo, neg_twiddles_im_hi, neg_twiddles_im_lo
def print_cpp_array(name, count, values, indent=4, per_line=4):
indent_str = " " * indent
print(f"__device__ double {name}[{count}] = {{")
for i, val in enumerate(values):
# Print a newline every 'per_line' entries.
if i % per_line == 0:
print(indent_str, end="")
print(val, end="")
if i != len(values) - 1:
print(", ", end="")
if (i + 1) % per_line == 0:
print("")
# If the last line wasn't completed, print a newline.
if len(values) % per_line != 0:
print("")
print("};\n")
def main():
if len(sys.argv) != 2:
sys.exit("Usage: python3 generate_twiddles.py <input_file>")
filename = sys.argv[1]
count, re_hi, re_lo, im_hi, im_lo = parse_file(filename)
# Generate C++ arrays.
print_cpp_array("neg_twiddles_re_hi", count, re_hi)
print_cpp_array("neg_twiddles_re_lo", count, re_lo)
print_cpp_array("neg_twiddles_im_hi", count, im_hi)
print_cpp_array("neg_twiddles_im_lo", count, im_lo)
if __name__ == "__main__":
main()

4099
parsed_log.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -35,8 +35,14 @@ fn two_diff(a: f64, b: f64) -> (f64, f64) {
#[inline(always)]
fn two_prod(a: f64, b: f64) -> (f64, f64) {
// println!("two_prod");
// println!("a: {:?}", a);
// println!("b: {:?}", b);
let p = a * b;
(p, f64::mul_add(a, b, -p))
// println!("p: {:?}", p);
let p2 = f64::mul_add(a, b, -p);
// println!("p2: {:?}", p2);
(p, p2)
}
use core::{
@@ -393,9 +399,21 @@ impl f128 {
/// Multiplies `a` and `b` and returns the result.
#[inline(always)]
pub fn mul_f128_f128(a: f128, b: f128) -> Self {
// println!("mul_f128_f128");
// println!("a: {:?}", a);
// println!("b: {:?}", b);
let (p1, p2) = two_prod(a.0, b.0);
// println!("p1: {:?}", p1);
// println!("p2: {:?}", p2);
let p2 = p2 + (a.0 * b.1 + a.1 * b.0);
// println!("p2: {:?}", p2);
// println!("a.0 * b.1: {:?}", a.0 * b.1);
// println!("a.1 * b.0: {:?}", a.1 * b.0);
let (p1, p2) = quick_two_sum(p1, p2);
// println!("p1: {:?}", p1);
// println!("p2: {:?}", p2);
Self(p1, p2)
}

View File

@@ -183,6 +183,7 @@ impl FftSimdF128 for Scalar {
#[inline(always)]
fn mul(self, a: (Self::Reg, Self::Reg), b: (Self::Reg, Self::Reg)) -> (Self::Reg, Self::Reg) {
println!("mul for scalar");
let f128(o0, o1) = f128(a.0, a.1) * f128(b.0, b.1);
(o0, o1)
}
@@ -316,15 +317,60 @@ trait FftSimdF128Ext: FftSimdF128 {
b_re: (Self::Reg, Self::Reg),
b_im: (Self::Reg, Self::Reg),
) -> ((Self::Reg, Self::Reg), (Self::Reg, Self::Reg)) {
println!("fftSimdF123::cplx_mul");
let a_re_x_b_re = self.mul(a_re, b_re);
let a_re_x_b_im = self.mul(a_re, b_im);
let a_im_x_b_re = self.mul(a_im, b_re);
let a_im_x_b_im = self.mul(a_im, b_im);
(
let a_re_0: f64 = unsafe { std::mem::transmute_copy(&a_re.0) };
let a_re_1: f64 = unsafe { std::mem::transmute_copy(&a_re.1) };
let a_im_0: f64 = unsafe { std::mem::transmute_copy(&a_im.0) };
let a_im_1: f64 = unsafe { std::mem::transmute_copy(&a_im.1) };
let b_re_0: f64 = unsafe { std::mem::transmute_copy(&b_re.0) };
let b_re_1: f64 = unsafe { std::mem::transmute_copy(&b_re.1) };
let b_im_0: f64 = unsafe { std::mem::transmute_copy(&b_im.0) };
let b_im_1: f64 = unsafe { std::mem::transmute_copy(&b_im.1) };
let a_re_x_b_re_0: f64 = unsafe { std::mem::transmute_copy(&a_re_x_b_re.0) };
let a_re_x_b_re_1: f64 = unsafe { std::mem::transmute_copy(&a_re_x_b_re.1) };
let a_re_x_b_im_0: f64 = unsafe { std::mem::transmute_copy(&a_re_x_b_im.0) };
let a_re_x_b_im_1: f64 = unsafe { std::mem::transmute_copy(&a_re_x_b_im.1) };
let a_im_x_b_re_0: f64 = unsafe { std::mem::transmute_copy(&a_im_x_b_re.0) };
let a_im_x_b_re_1: f64 = unsafe { std::mem::transmute_copy(&a_im_x_b_re.1) };
let a_im_x_b_im_0: f64 = unsafe { std::mem::transmute_copy(&a_im_x_b_im.0) };
let a_im_x_b_im_1: f64 = unsafe { std::mem::transmute_copy(&a_im_x_b_im.1) };
println!("a_re: {:.100} {:.100}", a_re_0, a_re_1);
println!("a_im: {:.100} {:.100}", a_im_0, a_im_1);
println!("b_re: {:.100} {:.100}", b_re_0, b_re_1);
println!("b_im: {:.100} {:.100}", b_im_0, b_im_1);
println!("a_re_x_b_re: {:.100} {:.100}", a_re_x_b_re_0, a_re_x_b_re_1);
println!("a_re_x_b_im: {:.100} {:.100}", a_re_x_b_im_0, a_re_x_b_im_1);
println!("a_im_x_b_re: {:.100} {:.100}", a_im_x_b_re_0, a_im_x_b_re_1);
println!("a_im_x_b_im: {:.100} {:.100}", a_im_x_b_im_0, a_im_x_b_im_1);
let (c_re, c_im) = (
self.sub(a_re_x_b_re, a_im_x_b_im),
self.add(a_im_x_b_re, a_re_x_b_im),
)
);
let c_re_0: f64 = unsafe { std::mem::transmute_copy(&c_re.0) };
let c_re_1: f64 = unsafe { std::mem::transmute_copy(&c_re.1) };
let c_im_0: f64 = unsafe { std::mem::transmute_copy(&c_im.0) };
let c_im_1: f64 = unsafe { std::mem::transmute_copy(&c_im.1) };
println!("a_re: {:.100} {:.100}", c_re_0, c_re_1);
println!("a_im: {:.100} {:.100}", c_im_0, c_im_1);
(c_re, c_im)
}
/// `a * conj(b)`
@@ -366,7 +412,9 @@ pub fn negacyclic_fwd_fft_scalar(
let mut m = 1;
let simd = Scalar;
let mut ii = 0;
while m < n {
ii = ii + 1;
t /= 2;
for i in 0..m {
@@ -390,6 +438,7 @@ pub fn negacyclic_fwd_fft_scalar(
{
let (z0_re, z0_im) = ((*z0_re0, *z0_re1), (*z0_im0, *z0_im1));
let (z1_re, z1_im) = ((*z1_re0, *z1_re1), (*z1_im0, *z1_im1));
println!("{:?} {:?}", w1_re, w1_im);
let (z1w_re, z1w_im) = simd.cplx_mul(z1_re, z1_im, w1_re, w1_im);
((*z0_re0, *z0_re1), (*z0_im0, *z0_im1)) =
@@ -398,6 +447,7 @@ pub fn negacyclic_fwd_fft_scalar(
simd.cplx_sub(z0_re, z0_im, z1w_re, z1w_im);
}
}
//break;
m *= 2;
}
@@ -1050,22 +1100,22 @@ pub fn negacyclic_fwd_fft(
twid_im0: &[f64],
twid_im1: &[f64],
) {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
#[cfg(feature = "nightly")]
if let Some(simd) = V4::try_new() {
return negacyclic_fwd_fft_avx512(
simd, data_re0, data_re1, data_im0, data_im1, twid_re0, twid_re1, twid_im0,
twid_im1,
);
}
if let Some(simd) = V3::try_new() {
return negacyclic_fwd_fft_avxfma(
simd, data_re0, data_re1, data_im0, data_im1, twid_re0, twid_re1, twid_im0,
twid_im1,
);
}
}
// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
// {
// #[cfg(feature = "nightly")]
// if let Some(simd) = V4::try_new() {
// return negacyclic_fwd_fft_avx512(
// simd, data_re0, data_re1, data_im0, data_im1, twid_re0, twid_re1, twid_im0,
// twid_im1,
// );
// }
// if let Some(simd) = V3::try_new() {
// return negacyclic_fwd_fft_avxfma(
// simd, data_re0, data_re1, data_im0, data_im1, twid_re0, twid_re1, twid_im0,
// twid_im1,
// );
// }
// }
negacyclic_fwd_fft_scalar(
data_re0, data_re1, data_im0, data_im1, twid_re0, twid_re1, twid_im0, twid_im1,
)
@@ -1803,6 +1853,47 @@ fn bitreverse(i: usize, n: usize) -> usize {
result
}
fn f64_to_hexfloat(f: f64) -> String {
// Special cases.
if f.is_nan() {
return "NaN".to_string();
}
if f.is_infinite() {
return if f.is_sign_positive() {
"inf".to_string()
} else {
"-inf".to_string()
};
}
if f == 0.0 {
// Handle both +0.0 and -0.0.
return if f.is_sign_negative() {
"-0x0p+0".to_string()
} else {
"0x0p+0".to_string()
};
}
let bits = f.to_bits();
let sign = if bits >> 63 == 1 { "-" } else { "" };
let exponent_bits = ((bits >> 52) & 0x7FF) as i32;
let fraction = bits & ((1u64 << 52) - 1);
// For normalized numbers the printed form is "0x1.<fraction>p<exp>".
// There are 52 fraction bits so printing 13 hex digits (since 13 * 4 = 52)
// captures the entire fractional part.
if exponent_bits != 0 {
let frac_str = format!("{:013x}", fraction);
let exp = exponent_bits - 1023;
return format!("{}0x1.{}p{:+}", sign, frac_str, exp);
} else {
// Subnormal numbers: no implicit leading 1.
let frac_str = format!("{:013x}", fraction);
// For subnormals the effective exponent is fixed at -1022.
return format!("{}0x0.{}p-1022", sign, frac_str);
}
}
#[doc(hidden)]
pub fn init_negacyclic_twiddles(
twid_re0: &mut [f64],
@@ -1824,6 +1915,10 @@ pub fn init_negacyclic_twiddles(
twid_re1[pos] = c.1;
twid_im0[pos] = s.0;
twid_im1[pos] = s.1;
println!("{:.73} {:.73} {:.73} {:.73}", c.0, c.1, s.0, s.1);
// println!("{} {} {} {}", f64_to_hexfloat(c.0), f64_to_hexfloat(c.1),
// f64_to_hexfloat(s.0), f64_to_hexfloat(s.1));
}
m *= 2;
}

View File

@@ -140,6 +140,7 @@ pub fn u128_to_f64(x: u128) -> f64 {
const B: f64 = (1u128 << 104) as f64;
const C: f64 = (1u128 << 76) as f64;
const D: f64 = u128::MAX as f64;
if x < 1 << 104 {
let l = f64::from_bits(A.to_bits() | ((x << 12) as u64 >> 12)) - A;
let h = f64::from_bits(B.to_bits() | (x >> 52) as u64) - B;

View File

@@ -5,7 +5,7 @@ use dyn_stack::{GlobalPodBuffer, ReborrowMut};
fn test_roundtrip<Scalar: UnsignedTorus>() {
let mut generator = new_random_generator();
for size_log in 6..=14 {
for size_log in 6..=6 {
let size = 1_usize << size_log;
let fourier_size = PolynomialSize(size).to_fourier_polynomial_size().0;
@@ -19,13 +19,42 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
let mut fourier_im0 = avec![0.0f64; fourier_size].into_boxed_slice();
let mut fourier_im1 = avec![0.0f64; fourier_size].into_boxed_slice();
println!("sizeof_scalar: {:?}", Scalar::BITS);
println!("poly.size(): {:?}", poly.len());
let mut base = Scalar::TWO * Scalar::TWO;
let mut exp = base;
for x in poly.as_mut().iter_mut() {
*x = generator.random_uniform();
*x = base - Scalar::ONE;
base = base * exp;
}
let n = poly.len();
// for coef in poly.iter() {
// println!("{:?}", coef);
// }
let mut mem = GlobalPodBuffer::new(fft.backward_scratch().unwrap());
let mut stack = PodStack::new(&mut mem);
let (standard_re, standard_im) = poly.split_at(n / 2);
convert_forward_torus(
&mut fourier_re0,
&mut fourier_re1,
&mut fourier_im0,
&mut fourier_im1,
&standard_re,
&standard_im,
);
for ((re0, re1), (im0, im1)) in (&*fourier_re0)
.iter()
.zip((&*fourier_re1).iter())
.zip((&*fourier_im0).iter().zip((&*fourier_im1).iter()))
{
println!("{:.73} {:.73} {:.73} {:.73}", re0, re1, im0, im1);
}
fft.forward_as_torus(
&mut fourier_re0,
&mut fourier_re1,
@@ -33,6 +62,15 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
&mut fourier_im1,
&poly,
);
for ((re0, re1), (im0, im1)) in (&*fourier_re0)
.iter()
.zip((&*fourier_re1).iter())
.zip((&*fourier_im0).iter().zip((&*fourier_im1).iter()))
{
println!("{:.100} {:.100} {:.100} {:.100}", re0, re1, im0, im1);
}
fft.backward_as_torus(
&mut roundtrip,
&fourier_re0,
@@ -42,6 +80,10 @@ fn test_roundtrip<Scalar: UnsignedTorus>() {
stack.rb_mut(),
);
for coefficient in roundtrip.iter() {
println!("{:?}", coefficient);
}
for (expected, actual) in izip!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
if Scalar::BITS <= 64 {
assert_eq!(*expected, *actual);

View File

@@ -0,0 +1,274 @@
use super::*;
use crate::core_crypto::commons::test_tools::{modular_distance, new_random_generator};
use crate::core_crypto::commons::utils::izip;
use crate::core_crypto::gpu::{
fourier_transform_forward_as_integer_f128_async, fourier_transform_forward_as_torus_f128_async,
CudaStreams,
};
use aligned_vec::avec;
use dyn_stack::{GlobalPodBuffer, PodStack, ReborrowMut};
use tfhe_cuda_backend::cuda_bind::cuda_synchronize_device;
fn test_roundtrip<Scalar: UnsignedTorus>() {
let mut generator = new_random_generator();
for size_log in 6..=6 {
let size = 1_usize << size_log;
let fourier_size = PolynomialSize(size).to_fourier_polynomial_size().0;
let fft = Fft128::new(PolynomialSize(size));
let fft = fft.as_view();
let mut poly = avec![Scalar::ZERO; size].into_boxed_slice();
let mut roundtrip = avec![Scalar::ZERO; size].into_boxed_slice();
let mut fourier_re0 = avec![0.0f64; fourier_size].into_boxed_slice();
let mut fourier_re1 = avec![0.0f64; fourier_size].into_boxed_slice();
let mut fourier_im0 = avec![0.0f64; fourier_size].into_boxed_slice();
let mut fourier_im1 = avec![0.0f64; fourier_size].into_boxed_slice();
if poly.len() > 64 {
return;
}
println!("sizeof_scalar: {:?}", Scalar::BITS);
println!("poly.size(): {:?}", poly.len());
let mut base = Scalar::TWO * Scalar::TWO;
let mut exp = base;
for x in poly.as_mut().iter_mut() {
*x = base - Scalar::ONE;
base = base * exp;
}
let mut mem = GlobalPodBuffer::new(fft.backward_scratch().unwrap());
let mut stack = PodStack::new(&mut mem);
// fft.forward_as_torus(
// &mut fourier_re0,
// &mut fourier_re1,
// &mut fourier_im0,
// &mut fourier_im1,
// &poly,
// );
let gpu_index = 0;
let stream = CudaStreams::new_single_gpu(gpu_index);
unsafe {
// println!("size: {:?}", size);
// println!("poly.len: {:?}", poly.len());
// println!("rust poly");
// for coefficient in poly.iter() {
// println!(
// "{:0width$b}",
// coefficient,
// width = std::mem::size_of::<Scalar>() * 8
// );
// }
// let mut rl = 0.;
// for mut coef in fourier_im1.iter_mut() {
// *coef = rl;
// rl = rl + 1.;
// }
fourier_transform_forward_as_torus_f128_async(
&stream,
&mut fourier_re0,
&mut fourier_re1,
&mut fourier_im0,
&mut fourier_im1,
&poly,
poly.len() as u32,
1,
);
}
unsafe {
cuda_synchronize_device(0);
}
for ((re0, re1), (im0, im1)) in (&*fourier_re0)
.iter()
.zip((&*fourier_re1).iter())
.zip((&*fourier_im0).iter().zip((&*fourier_im1).iter()))
{
println!("{:.100} {:.100} {:.100} {:.100}", re0, re1, im0, im1);
}
fft.backward_as_torus(
&mut roundtrip,
&fourier_re0,
&fourier_re1,
&fourier_im0,
&fourier_im1,
stack.rb_mut(),
);
for coefficient in roundtrip.iter() {
println!("{:?}", coefficient);
}
for (expected, actual) in izip!(poly.as_ref().iter(), roundtrip.as_ref().iter()) {
if Scalar::BITS <= 64 {
assert_eq!(*expected, *actual);
} else {
let abs_diff = modular_distance(*expected, *actual);
let threshold = Scalar::ONE << (128 - 100);
assert!(
abs_diff < threshold,
"abs_diff: {abs_diff}, threshold: {threshold}",
);
}
}
}
}
// fn test_product<Scalar: UnsignedTorus>() {
// fn convolution_naive<Scalar: UnsignedTorus>(
// out: &mut [Scalar],
// lhs: &[Scalar],
// rhs: &[Scalar],
// ) {
// assert_eq!(out.len(), lhs.len());
// assert_eq!(out.len(), rhs.len());
// let n = out.len();
// let mut full_prod = vec![Scalar::ZERO; 2 * n];
// for i in 0..n {
// for j in 0..n {
// full_prod[i + j] = full_prod[i + j].wrapping_add(lhs[i].wrapping_mul(rhs[j]));
// }
// }
// for i in 0..n {
// out[i] = full_prod[i].wrapping_sub(full_prod[i + n]);
// }
// }
//
// let mut generator = new_random_generator();
// for size_log in 6..=14 {
// for _ in 0..10 {
// let size = 1_usize << size_log;
// let fourier_size = PolynomialSize(size).to_fourier_polynomial_size().0;
//
// let fft = Fft128::new(PolynomialSize(size));
// let fft = fft.as_view();
//
// let mut poly0 = avec![Scalar::ZERO; size].into_boxed_slice();
// let mut poly1 = avec![Scalar::ZERO; size].into_boxed_slice();
//
// let mut convolution_from_fft = avec![Scalar::ZERO; size].into_boxed_slice();
// let mut convolution_from_naive = avec![Scalar::ZERO; size].into_boxed_slice();
//
// let mut fourier0_re0 = avec![0.0f64; fourier_size].into_boxed_slice();
// let mut fourier0_re1 = avec![0.0f64; fourier_size].into_boxed_slice();
// let mut fourier0_im0 = avec![0.0f64; fourier_size].into_boxed_slice();
// let mut fourier0_im1 = avec![0.0f64; fourier_size].into_boxed_slice();
//
// let mut fourier1_re0 = avec![0.0f64; fourier_size].into_boxed_slice();
// let mut fourier1_re1 = avec![0.0f64; fourier_size].into_boxed_slice();
// let mut fourier1_im0 = avec![0.0f64; fourier_size].into_boxed_slice();
// let mut fourier1_im1 = avec![0.0f64; fourier_size].into_boxed_slice();
//
// let integer_magnitude = 16;
// for (x, y) in izip!(poly0.as_mut().iter_mut(), poly1.as_mut().iter_mut()) {
// *x = generator.random_uniform();
// *y = generator.random_uniform();
//
// *y >>= Scalar::BITS - integer_magnitude;
// }
//
// let mut mem = GlobalPodBuffer::new(fft.backward_scratch().unwrap());
// let mut stack = PodStack::new(&mut mem);
//
// fft.forward_as_torus(
// &mut fourier0_re0,
// &mut fourier0_re1,
// &mut fourier0_im0,
// &mut fourier0_im1,
// &poly0,
// );
// fft.forward_as_integer(
// &mut fourier1_re0,
// &mut fourier1_re1,
// &mut fourier1_im0,
// &mut fourier1_im1,
// &poly1,
// );
//
// for (f0_re0, f0_re1, f0_im0, f0_im1, f1_re0, f1_re1, f1_im0, f1_im1) in izip!(
// &mut *fourier0_re0,
// &mut *fourier0_re1,
// &mut *fourier0_im0,
// &mut *fourier0_im1,
// &*fourier1_re0,
// &*fourier1_re1,
// &*fourier1_im0,
// &*fourier1_im1,
// ) {
// let f0_re = f128(*f0_re0, *f0_re1);
// let f0_im = f128(*f0_im0, *f0_im1);
// let f1_re = f128(*f1_re0, *f1_re1);
// let f1_im = f128(*f1_im0, *f1_im1);
//
// f128(*f0_re0, *f0_re1) = f0_re * f1_re - f0_im * f1_im;
// f128(*f0_im0, *f0_im1) = f0_im * f1_re + f0_re * f1_im;
// }
//
// fft.backward_as_torus(
// &mut convolution_from_fft,
// &fourier0_re0,
// &fourier0_re1,
// &fourier0_im0,
// &fourier0_im1,
// stack.rb_mut(),
// );
// convolution_naive(
// convolution_from_naive.as_mut(),
// poly0.as_ref(),
// poly1.as_ref(),
// );
//
// for (expected, actual) in izip!(
// convolution_from_naive.as_ref().iter(),
// convolution_from_fft.as_ref().iter()
// ) {
// let threshold = Scalar::ONE
// << (Scalar::BITS.saturating_sub(100 - integer_magnitude - size_log));
// let abs_diff = modular_distance(*expected, *actual);
// assert!(
// abs_diff <= threshold,
// "abs_diff: {abs_diff}, threshold: {threshold}",
// );
// }
// }
// }
// }
// #[test]
// fn test_roundtrip_u32() {
// test_roundtrip::<u32>();
// }
// #[test]
// fn test_roundtrip_u64() {
// test_roundtrip::<u64>();
// }
#[test]
fn test_roundtrip_u128() {
test_roundtrip::<u128>();
}
// fn test_roundtrip_u128<
// Scalar: UnsignedTorus + Sync + Send + CastFrom<usize> + CastInto<usize>,
// >(
// params: ClassicTestParams<Scalar>,
// ) {
// test_roundtrip::<u128>();
// }
// create_gpu_parametrized_test!(test_roundtrip_u128);
// #[test]
// fn test_product_u32() {
// test_product::<u32>();
// }
//
// #[test]
// fn test_product_u64() {
// test_product::<u64>();
// }
//
// #[test]
// fn test_product_u128() {
// test_product::<u128>();
// }

View File

@@ -1,5 +1,6 @@
use crate::core_crypto::algorithms::test::*;
mod fft;
mod glwe_sample_extraction;
mod lwe_keyswitch;
mod lwe_linear_algebra;

View File

@@ -630,6 +630,56 @@ pub unsafe fn mult_lwe_ciphertext_vector_cleartext_vector<T: UnsignedInteger>(
);
}
#[allow(clippy::too_many_arguments)]
pub unsafe fn fourier_transform_forward_as_integer_f128_async<T: UnsignedInteger>(
streams: &CudaStreams,
re0: &mut [f64],
re1: &mut [f64],
im0: &mut [f64],
im1: &mut [f64],
standard: &[T],
fft_size: u32,
number_of_samples: u32,
) {
println!("before cuda api: {:?}", re0);
cuda_fourier_transform_forward_as_integer_f128_async(
streams.ptr[0],
streams.gpu_indexes[0],
re0.as_mut_ptr() as *mut c_void,
re1.as_mut_ptr() as *mut c_void,
im0.as_mut_ptr() as *mut c_void,
im1.as_mut_ptr() as *mut c_void,
standard.as_ptr() as *const c_void,
fft_size,
number_of_samples,
);
}
#[allow(clippy::too_many_arguments)]
pub unsafe fn fourier_transform_forward_as_torus_f128_async<T: UnsignedInteger>(
streams: &CudaStreams,
re0: &mut [f64],
re1: &mut [f64],
im0: &mut [f64],
im1: &mut [f64],
standard: &[T],
fft_size: u32,
number_of_samples: u32,
) {
println!("before cuda api: {:?}", re0);
cuda_fourier_transform_forward_as_torus_f128_async(
streams.ptr[0],
streams.gpu_indexes[0],
re0.as_mut_ptr() as *mut c_void,
re1.as_mut_ptr() as *mut c_void,
im0.as_mut_ptr() as *mut c_void,
im1.as_mut_ptr() as *mut c_void,
standard.as_ptr() as *const c_void,
fft_size,
number_of_samples,
);
}
#[derive(Debug)]
pub struct CudaLweList<T: UnsignedInteger> {
// Pointer to GPU data

12293
twids.log Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff