refactor(cuda): remove PolynomialFourier

This commit is contained in:
Agnes Leroy
2022-10-21 10:07:04 +02:00
committed by Agnès Leroy
parent 5160931a69
commit 17312bbd52
4 changed files with 21 additions and 174 deletions

View File

@@ -187,20 +187,20 @@ __global__ void device_bootstrap_amortized(
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_mask_slice = PolynomialFourier<double2, params>(
auto bsk_mask_slice =
get_ith_mask_kth_block(bootstrapping_key, iteration, 0, level,
polynomial_size, 1, level_count));
auto bsk_body_slice = PolynomialFourier<double2, params>(
polynomial_size, 1, level_count);
auto bsk_body_slice =
get_ith_body_kth_block(bootstrapping_key, iteration, 0, level,
polynomial_size, 1, level_count));
polynomial_size, 1, level_count);
synchronize_threads_in_block();
// Perform the coefficient-wise product with the two pieces of
// bootstrapping key
polynomial_product_accumulate_in_fourier_domain(
polynomial_product_accumulate_in_fourier_domain<params, double2>(
mask_res_fft, accumulator_fft, bsk_mask_slice);
polynomial_product_accumulate_in_fourier_domain(
polynomial_product_accumulate_in_fourier_domain<params, double2>(
body_res_fft, accumulator_fft, bsk_body_slice);
synchronize_threads_in_block();
@@ -216,18 +216,18 @@ __global__ void device_bootstrap_amortized(
correction_direct_fft_inplace<params>(accumulator_fft);
auto bsk_mask_slice_2 = PolynomialFourier<double2, params>(
auto bsk_mask_slice_2 =
get_ith_mask_kth_block(bootstrapping_key, iteration, 1, level,
polynomial_size, 1, level_count));
auto bsk_body_slice_2 = PolynomialFourier<double2, params>(
polynomial_size, 1, level_count);
auto bsk_body_slice_2 =
get_ith_body_kth_block(bootstrapping_key, iteration, 1, level,
polynomial_size, 1, level_count));
polynomial_size, 1, level_count);
synchronize_threads_in_block();
polynomial_product_accumulate_in_fourier_domain(
polynomial_product_accumulate_in_fourier_domain<params, double2>(
mask_res_fft, accumulator_fft, bsk_mask_slice_2);
polynomial_product_accumulate_in_fourier_domain(
polynomial_product_accumulate_in_fourier_domain<params, double2>(
body_res_fft, accumulator_fft, bsk_body_slice_2);
}

View File

@@ -51,12 +51,12 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, int16_t *glwe_decomposed,
// needed to perform the external product in this block (corresponding to
// the same decomposition level)
auto bsk_mask_slice = PolynomialFourier<double2, params>(
auto bsk_mask_slice =
get_ith_mask_kth_block(bootstrapping_key, iteration, blockIdx.y,
blockIdx.x, polynomial_size, 1, level_count));
auto bsk_body_slice = PolynomialFourier<double2, params>(
blockIdx.x, polynomial_size, 1, level_count);
auto bsk_body_slice =
get_ith_body_kth_block(bootstrapping_key, iteration, blockIdx.y,
blockIdx.x, polynomial_size, 1, level_count));
blockIdx.x, polynomial_size, 1, level_count);
// Perform the matrix multiplication between the GGSW and the GLWE,
// each block operating on a single level for mask and body
@@ -77,7 +77,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, int16_t *glwe_decomposed,
// first product
for (int i = 0; i < params::opt / 2; i++) {
first_processed_acc[tid] = fft[tid] * first_processed_bsk.m_values[tid];
first_processed_acc[tid] = fft[tid] * first_processed_bsk[tid];
tid += params::degree / params::opt;
}
@@ -85,7 +85,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, int16_t *glwe_decomposed,
tid = threadIdx.x;
// second product
for (int i = 0; i < params::opt / 2; i++) {
second_processed_acc[tid] += fft[tid] * second_processed_bsk.m_values[tid];
second_processed_acc[tid] += fft[tid] * second_processed_bsk[tid];
tid += params::degree / params::opt;
}

View File

@@ -25,7 +25,6 @@ public:
uint32_t m_size;
__device__ ExtraMemory(uint32_t size) : m_size(size) {}
};
template <typename T, class params> class PolynomialFourier;
template <typename T, class params> class Polynomial;
@@ -87,22 +86,6 @@ public:
synchronize_threads_in_block();
}
__device__ void copy_into_ith_polynomial(PolynomialFourier<T, params> &source,
int i) {
int tid = threadIdx.x;
int begin = i * (params::degree / 2 + 1);
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
this->m_data[tid + begin] = source.m_values[tid];
tid = tid + params::degree / params::opt;
}
if (threadIdx.x == 0) {
this->m_data[params::degree / 2 + begin] =
source.m_values[params::degree / 2];
}
}
__device__ void split_into_polynomials(Polynomial<T, params> &first,
Polynomial<T, params> &second) {
int tid = threadIdx.x;
@@ -115,77 +98,6 @@ public:
}
};
template <typename T, class params> class PolynomialFourier {
public:
T *m_values;
uint32_t degree;
__device__ __host__ PolynomialFourier(T *m_values) : m_values(m_values) {}
__device__ PolynomialFourier(SharedMemory &shmem) : degree(degree) {
shmem.get_allocation(&this->m_values, params::degree);
}
__device__ PolynomialFourier(SharedMemory &shmem, ExtraMemory extra_memory)
: degree(degree) {
shmem.get_allocation(&this->m_values, params::degree + extra_memory.m_size);
}
__device__ PolynomialFourier(SharedMemory &shmem, uint32_t degree)
: degree(degree) {
shmem.get_allocation(&this->m_values, degree);
}
__host__ PolynomialFourier(DeviceMemory &dmem, int device) : degree(degree) {
dmem.get_allocation(&this->m_values, params::degree, device);
}
__device__ char *reuse_memory() { return (char *)m_values; }
__device__ void copy_from(PolynomialFourier<T, params> &source, int begin) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
this->m_values[tid + begin] = source.m_values[tid];
tid = tid + params::degree / params::opt;
}
}
__device__ void fill_with(T value) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
m_values[tid] = value;
tid += params::degree / params::opt;
}
}
__device__ void swap_quarters_inplace() {
int tid = threadIdx.x;
int s1 = params::quarter;
int s2 = params::three_quarters;
T tmp = m_values[s2 + tid];
m_values[s2 + tid] = m_values[s1 + tid];
m_values[s1 + tid] = tmp;
}
__device__ void add_polynomial_inplace(VectorPolynomial<T, params> &source,
int polynomial_number) {
int tid = threadIdx.x;
int begin = polynomial_number * (params::degree / 2 + 1);
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
this->m_values[tid] += source.m_data[tid + begin];
tid = tid + params::degree / params::opt;
}
if (threadIdx.x == 0) {
this->m_values[params::degree / 2] +=
source.m_data[params::degree / 2 + begin];
}
}
__device__ T &operator[](int i) { return m_values[i]; }
};
template <typename T, class params> class Polynomial {
public:
T *coefficients;
@@ -386,28 +298,6 @@ public:
}
}
__device__ void
to_complex_compressed(PolynomialFourier<double2, params> &dest) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
dest.m_values[tid].x = (double)coefficients[2 * tid];
dest.m_values[tid].y = (double)coefficients[2 * tid + 1];
tid += params::degree / params::opt;
}
}
__device__ void to_complex(PolynomialFourier<double2, params> &dest) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
dest.m_values[tid].x = (double)coefficients[tid];
dest.m_values[tid].y = 0.0;
tid += params::degree / params::opt;
}
}
__device__ void multiply_by_scalar_inplace(T scalar) {
int tid = threadIdx.x;
const int grid_dim = blockDim.x;

View File

@@ -14,9 +14,9 @@ __device__ void sub_polynomial(FT *result, FT *first, FT *second) {
}
}
template <class params, typename FT>
__device__ void polynomial_product_in_fourier_domain(FT *result, FT *first,
FT *second) {
template <class params, typename T>
__device__ void polynomial_product_in_fourier_domain(T *result, T *first,
T *second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
result[tid] = first[tid] * second[tid];
@@ -29,49 +29,6 @@ __device__ void polynomial_product_in_fourier_domain(FT *result, FT *first,
}
}
template <class params, typename FT>
__device__ void
polynomial_product_in_fourier_domain(PolynomialFourier<FT, params> &result,
PolynomialFourier<FT, params> &first,
PolynomialFourier<FT, params> &second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
result[tid] = first[tid] * second[tid];
tid += params::degree / params::opt;
}
if (threadIdx.x == 0) {
result[params::degree / 2] =
first[params::degree / 2] * second[params::degree / 2];
}
}
template <class params, typename FT>
__device__ void polynomial_product_accumulate_in_fourier_domain(
PolynomialFourier<FT, params> &result, PolynomialFourier<FT, params> &first,
PolynomialFourier<FT, params> &second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
result[tid] += first[tid] * second[tid];
tid += params::degree / params::opt;
}
if (threadIdx.x == 0) {
result[params::degree / 2] +=
first[params::degree / 2] * second[params::degree / 2];
}
}
template <class params, typename FT>
__device__ void polynomial_product_accumulate_in_fourier_domain(
FT *result, FT *first, PolynomialFourier<FT, params> &second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
result[tid] += first[tid] * second.m_values[tid];
tid += params::degree / params::opt;
}
}
template <class params, typename T>
__device__ void polynomial_product_accumulate_in_fourier_domain(T *result,
T *first,