mirror of
https://github.com/zama-ai/concrete.git
synced 2026-02-08 19:44:57 -05:00
refactor(cuda): remove PolynomialFourier
This commit is contained in:
@@ -187,20 +187,20 @@ __global__ void device_bootstrap_amortized(
|
||||
|
||||
// Get the bootstrapping key piece necessary for the multiplication
|
||||
// It is already in the Fourier domain
|
||||
auto bsk_mask_slice = PolynomialFourier<double2, params>(
|
||||
auto bsk_mask_slice =
|
||||
get_ith_mask_kth_block(bootstrapping_key, iteration, 0, level,
|
||||
polynomial_size, 1, level_count));
|
||||
auto bsk_body_slice = PolynomialFourier<double2, params>(
|
||||
polynomial_size, 1, level_count);
|
||||
auto bsk_body_slice =
|
||||
get_ith_body_kth_block(bootstrapping_key, iteration, 0, level,
|
||||
polynomial_size, 1, level_count));
|
||||
polynomial_size, 1, level_count);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform the coefficient-wise product with the two pieces of
|
||||
// bootstrapping key
|
||||
polynomial_product_accumulate_in_fourier_domain(
|
||||
polynomial_product_accumulate_in_fourier_domain<params, double2>(
|
||||
mask_res_fft, accumulator_fft, bsk_mask_slice);
|
||||
polynomial_product_accumulate_in_fourier_domain(
|
||||
polynomial_product_accumulate_in_fourier_domain<params, double2>(
|
||||
body_res_fft, accumulator_fft, bsk_body_slice);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
@@ -216,18 +216,18 @@ __global__ void device_bootstrap_amortized(
|
||||
|
||||
correction_direct_fft_inplace<params>(accumulator_fft);
|
||||
|
||||
auto bsk_mask_slice_2 = PolynomialFourier<double2, params>(
|
||||
auto bsk_mask_slice_2 =
|
||||
get_ith_mask_kth_block(bootstrapping_key, iteration, 1, level,
|
||||
polynomial_size, 1, level_count));
|
||||
auto bsk_body_slice_2 = PolynomialFourier<double2, params>(
|
||||
polynomial_size, 1, level_count);
|
||||
auto bsk_body_slice_2 =
|
||||
get_ith_body_kth_block(bootstrapping_key, iteration, 1, level,
|
||||
polynomial_size, 1, level_count));
|
||||
polynomial_size, 1, level_count);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
polynomial_product_accumulate_in_fourier_domain(
|
||||
polynomial_product_accumulate_in_fourier_domain<params, double2>(
|
||||
mask_res_fft, accumulator_fft, bsk_mask_slice_2);
|
||||
polynomial_product_accumulate_in_fourier_domain(
|
||||
polynomial_product_accumulate_in_fourier_domain<params, double2>(
|
||||
body_res_fft, accumulator_fft, bsk_body_slice_2);
|
||||
}
|
||||
|
||||
|
||||
@@ -51,12 +51,12 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, int16_t *glwe_decomposed,
|
||||
// needed to perform the external product in this block (corresponding to
|
||||
// the same decomposition level)
|
||||
|
||||
auto bsk_mask_slice = PolynomialFourier<double2, params>(
|
||||
auto bsk_mask_slice =
|
||||
get_ith_mask_kth_block(bootstrapping_key, iteration, blockIdx.y,
|
||||
blockIdx.x, polynomial_size, 1, level_count));
|
||||
auto bsk_body_slice = PolynomialFourier<double2, params>(
|
||||
blockIdx.x, polynomial_size, 1, level_count);
|
||||
auto bsk_body_slice =
|
||||
get_ith_body_kth_block(bootstrapping_key, iteration, blockIdx.y,
|
||||
blockIdx.x, polynomial_size, 1, level_count));
|
||||
blockIdx.x, polynomial_size, 1, level_count);
|
||||
|
||||
// Perform the matrix multiplication between the GGSW and the GLWE,
|
||||
// each block operating on a single level for mask and body
|
||||
@@ -77,7 +77,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, int16_t *glwe_decomposed,
|
||||
|
||||
// first product
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
first_processed_acc[tid] = fft[tid] * first_processed_bsk.m_values[tid];
|
||||
first_processed_acc[tid] = fft[tid] * first_processed_bsk[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
@@ -85,7 +85,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, int16_t *glwe_decomposed,
|
||||
tid = threadIdx.x;
|
||||
// second product
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
second_processed_acc[tid] += fft[tid] * second_processed_bsk.m_values[tid];
|
||||
second_processed_acc[tid] += fft[tid] * second_processed_bsk[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
|
||||
@@ -25,7 +25,6 @@ public:
|
||||
uint32_t m_size;
|
||||
__device__ ExtraMemory(uint32_t size) : m_size(size) {}
|
||||
};
|
||||
template <typename T, class params> class PolynomialFourier;
|
||||
|
||||
template <typename T, class params> class Polynomial;
|
||||
|
||||
@@ -87,22 +86,6 @@ public:
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
__device__ void copy_into_ith_polynomial(PolynomialFourier<T, params> &source,
|
||||
int i) {
|
||||
int tid = threadIdx.x;
|
||||
int begin = i * (params::degree / 2 + 1);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
this->m_data[tid + begin] = source.m_values[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
this->m_data[params::degree / 2 + begin] =
|
||||
source.m_values[params::degree / 2];
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void split_into_polynomials(Polynomial<T, params> &first,
|
||||
Polynomial<T, params> &second) {
|
||||
int tid = threadIdx.x;
|
||||
@@ -115,77 +98,6 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, class params> class PolynomialFourier {
|
||||
public:
|
||||
T *m_values;
|
||||
uint32_t degree;
|
||||
|
||||
__device__ __host__ PolynomialFourier(T *m_values) : m_values(m_values) {}
|
||||
|
||||
__device__ PolynomialFourier(SharedMemory &shmem) : degree(degree) {
|
||||
shmem.get_allocation(&this->m_values, params::degree);
|
||||
}
|
||||
|
||||
__device__ PolynomialFourier(SharedMemory &shmem, ExtraMemory extra_memory)
|
||||
: degree(degree) {
|
||||
shmem.get_allocation(&this->m_values, params::degree + extra_memory.m_size);
|
||||
}
|
||||
__device__ PolynomialFourier(SharedMemory &shmem, uint32_t degree)
|
||||
: degree(degree) {
|
||||
shmem.get_allocation(&this->m_values, degree);
|
||||
}
|
||||
|
||||
__host__ PolynomialFourier(DeviceMemory &dmem, int device) : degree(degree) {
|
||||
dmem.get_allocation(&this->m_values, params::degree, device);
|
||||
}
|
||||
|
||||
__device__ char *reuse_memory() { return (char *)m_values; }
|
||||
__device__ void copy_from(PolynomialFourier<T, params> &source, int begin) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
this->m_values[tid + begin] = source.m_values[tid];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
__device__ void fill_with(T value) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
m_values[tid] = value;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void swap_quarters_inplace() {
|
||||
int tid = threadIdx.x;
|
||||
int s1 = params::quarter;
|
||||
int s2 = params::three_quarters;
|
||||
|
||||
T tmp = m_values[s2 + tid];
|
||||
m_values[s2 + tid] = m_values[s1 + tid];
|
||||
m_values[s1 + tid] = tmp;
|
||||
}
|
||||
|
||||
__device__ void add_polynomial_inplace(VectorPolynomial<T, params> &source,
|
||||
int polynomial_number) {
|
||||
int tid = threadIdx.x;
|
||||
int begin = polynomial_number * (params::degree / 2 + 1);
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
this->m_values[tid] += source.m_data[tid + begin];
|
||||
tid = tid + params::degree / params::opt;
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
this->m_values[params::degree / 2] +=
|
||||
source.m_data[params::degree / 2 + begin];
|
||||
}
|
||||
}
|
||||
|
||||
__device__ T &operator[](int i) { return m_values[i]; }
|
||||
};
|
||||
|
||||
template <typename T, class params> class Polynomial {
|
||||
public:
|
||||
T *coefficients;
|
||||
@@ -386,28 +298,6 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void
|
||||
to_complex_compressed(PolynomialFourier<double2, params> &dest) {
|
||||
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
dest.m_values[tid].x = (double)coefficients[2 * tid];
|
||||
dest.m_values[tid].y = (double)coefficients[2 * tid + 1];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void to_complex(PolynomialFourier<double2, params> &dest) {
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
dest.m_values[tid].x = (double)coefficients[tid];
|
||||
dest.m_values[tid].y = 0.0;
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void multiply_by_scalar_inplace(T scalar) {
|
||||
int tid = threadIdx.x;
|
||||
const int grid_dim = blockDim.x;
|
||||
|
||||
@@ -14,9 +14,9 @@ __device__ void sub_polynomial(FT *result, FT *first, FT *second) {
|
||||
}
|
||||
}
|
||||
|
||||
template <class params, typename FT>
|
||||
__device__ void polynomial_product_in_fourier_domain(FT *result, FT *first,
|
||||
FT *second) {
|
||||
template <class params, typename T>
|
||||
__device__ void polynomial_product_in_fourier_domain(T *result, T *first,
|
||||
T *second) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
result[tid] = first[tid] * second[tid];
|
||||
@@ -29,49 +29,6 @@ __device__ void polynomial_product_in_fourier_domain(FT *result, FT *first,
|
||||
}
|
||||
}
|
||||
|
||||
template <class params, typename FT>
|
||||
__device__ void
|
||||
polynomial_product_in_fourier_domain(PolynomialFourier<FT, params> &result,
|
||||
PolynomialFourier<FT, params> &first,
|
||||
PolynomialFourier<FT, params> &second) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
result[tid] = first[tid] * second[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
result[params::degree / 2] =
|
||||
first[params::degree / 2] * second[params::degree / 2];
|
||||
}
|
||||
}
|
||||
|
||||
template <class params, typename FT>
|
||||
__device__ void polynomial_product_accumulate_in_fourier_domain(
|
||||
PolynomialFourier<FT, params> &result, PolynomialFourier<FT, params> &first,
|
||||
PolynomialFourier<FT, params> &second) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
result[tid] += first[tid] * second[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
result[params::degree / 2] +=
|
||||
first[params::degree / 2] * second[params::degree / 2];
|
||||
}
|
||||
}
|
||||
|
||||
template <class params, typename FT>
|
||||
__device__ void polynomial_product_accumulate_in_fourier_domain(
|
||||
FT *result, FT *first, PolynomialFourier<FT, params> &second) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
result[tid] += first[tid] * second.m_values[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
template <class params, typename T>
|
||||
__device__ void polynomial_product_accumulate_in_fourier_domain(T *result,
|
||||
T *first,
|
||||
|
||||
Reference in New Issue
Block a user