diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh index 2cde4af06..51d9ff9ff 100644 --- a/src/bootstrap_amortized.cuh +++ b/src/bootstrap_amortized.cuh @@ -187,20 +187,20 @@ __global__ void device_bootstrap_amortized( // Get the bootstrapping key piece necessary for the multiplication // It is already in the Fourier domain - auto bsk_mask_slice = PolynomialFourier( + auto bsk_mask_slice = get_ith_mask_kth_block(bootstrapping_key, iteration, 0, level, - polynomial_size, 1, level_count)); - auto bsk_body_slice = PolynomialFourier( + polynomial_size, 1, level_count); + auto bsk_body_slice = get_ith_body_kth_block(bootstrapping_key, iteration, 0, level, - polynomial_size, 1, level_count)); + polynomial_size, 1, level_count); synchronize_threads_in_block(); // Perform the coefficient-wise product with the two pieces of // bootstrapping key - polynomial_product_accumulate_in_fourier_domain( + polynomial_product_accumulate_in_fourier_domain( mask_res_fft, accumulator_fft, bsk_mask_slice); - polynomial_product_accumulate_in_fourier_domain( + polynomial_product_accumulate_in_fourier_domain( body_res_fft, accumulator_fft, bsk_body_slice); synchronize_threads_in_block(); @@ -216,18 +216,18 @@ __global__ void device_bootstrap_amortized( correction_direct_fft_inplace(accumulator_fft); - auto bsk_mask_slice_2 = PolynomialFourier( + auto bsk_mask_slice_2 = get_ith_mask_kth_block(bootstrapping_key, iteration, 1, level, - polynomial_size, 1, level_count)); - auto bsk_body_slice_2 = PolynomialFourier( + polynomial_size, 1, level_count); + auto bsk_body_slice_2 = get_ith_body_kth_block(bootstrapping_key, iteration, 1, level, - polynomial_size, 1, level_count)); + polynomial_size, 1, level_count); synchronize_threads_in_block(); - polynomial_product_accumulate_in_fourier_domain( + polynomial_product_accumulate_in_fourier_domain( mask_res_fft, accumulator_fft, bsk_mask_slice_2); - polynomial_product_accumulate_in_fourier_domain( + polynomial_product_accumulate_in_fourier_domain( body_res_fft, accumulator_fft, bsk_body_slice_2); } diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh index 8ebe7a2eb..9c6206079 100644 --- a/src/bootstrap_low_latency.cuh +++ b/src/bootstrap_low_latency.cuh @@ -51,12 +51,12 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, int16_t *glwe_decomposed, // needed to perform the external product in this block (corresponding to // the same decomposition level) - auto bsk_mask_slice = PolynomialFourier( + auto bsk_mask_slice = get_ith_mask_kth_block(bootstrapping_key, iteration, blockIdx.y, - blockIdx.x, polynomial_size, 1, level_count)); - auto bsk_body_slice = PolynomialFourier( + blockIdx.x, polynomial_size, 1, level_count); + auto bsk_body_slice = get_ith_body_kth_block(bootstrapping_key, iteration, blockIdx.y, - blockIdx.x, polynomial_size, 1, level_count)); + blockIdx.x, polynomial_size, 1, level_count); // Perform the matrix multiplication between the GGSW and the GLWE, // each block operating on a single level for mask and body @@ -77,7 +77,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, int16_t *glwe_decomposed, // first product for (int i = 0; i < params::opt / 2; i++) { - first_processed_acc[tid] = fft[tid] * first_processed_bsk.m_values[tid]; + first_processed_acc[tid] = fft[tid] * first_processed_bsk[tid]; tid += params::degree / params::opt; } @@ -85,7 +85,7 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, int16_t *glwe_decomposed, tid = threadIdx.x; // second product for (int i = 0; i < params::opt / 2; i++) { - second_processed_acc[tid] += fft[tid] * second_processed_bsk.m_values[tid]; + second_processed_acc[tid] += fft[tid] * second_processed_bsk[tid]; tid += params::degree / params::opt; } diff --git a/src/polynomial/polynomial.cuh b/src/polynomial/polynomial.cuh index df770d2e1..f6eaef294 100644 --- a/src/polynomial/polynomial.cuh +++ b/src/polynomial/polynomial.cuh @@ -25,7 +25,6 @@ public: uint32_t m_size; __device__ ExtraMemory(uint32_t size) : m_size(size) {} }; -template class PolynomialFourier; template class Polynomial; @@ -87,22 +86,6 @@ public: synchronize_threads_in_block(); } - __device__ void copy_into_ith_polynomial(PolynomialFourier &source, - int i) { - int tid = threadIdx.x; - int begin = i * (params::degree / 2 + 1); -#pragma unroll - for (int i = 0; i < params::opt / 2; i++) { - this->m_data[tid + begin] = source.m_values[tid]; - tid = tid + params::degree / params::opt; - } - - if (threadIdx.x == 0) { - this->m_data[params::degree / 2 + begin] = - source.m_values[params::degree / 2]; - } - } - __device__ void split_into_polynomials(Polynomial &first, Polynomial &second) { int tid = threadIdx.x; @@ -115,77 +98,6 @@ public: } }; -template class PolynomialFourier { -public: - T *m_values; - uint32_t degree; - - __device__ __host__ PolynomialFourier(T *m_values) : m_values(m_values) {} - - __device__ PolynomialFourier(SharedMemory &shmem) : degree(degree) { - shmem.get_allocation(&this->m_values, params::degree); - } - - __device__ PolynomialFourier(SharedMemory &shmem, ExtraMemory extra_memory) - : degree(degree) { - shmem.get_allocation(&this->m_values, params::degree + extra_memory.m_size); - } - __device__ PolynomialFourier(SharedMemory &shmem, uint32_t degree) - : degree(degree) { - shmem.get_allocation(&this->m_values, degree); - } - - __host__ PolynomialFourier(DeviceMemory &dmem, int device) : degree(degree) { - dmem.get_allocation(&this->m_values, params::degree, device); - } - - __device__ char *reuse_memory() { return (char *)m_values; } - __device__ void copy_from(PolynomialFourier &source, int begin) { - int tid = threadIdx.x; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - this->m_values[tid + begin] = source.m_values[tid]; - tid = tid + params::degree / params::opt; - } - } - __device__ void fill_with(T value) { - int tid = threadIdx.x; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - m_values[tid] = value; - tid += params::degree / params::opt; - } - } - - __device__ void swap_quarters_inplace() { - int tid = threadIdx.x; - int s1 = params::quarter; - int s2 = params::three_quarters; - - T tmp = m_values[s2 + tid]; - m_values[s2 + tid] = m_values[s1 + tid]; - m_values[s1 + tid] = tmp; - } - - __device__ void add_polynomial_inplace(VectorPolynomial &source, - int polynomial_number) { - int tid = threadIdx.x; - int begin = polynomial_number * (params::degree / 2 + 1); -#pragma unroll - for (int i = 0; i < params::opt / 2; i++) { - this->m_values[tid] += source.m_data[tid + begin]; - tid = tid + params::degree / params::opt; - } - - if (threadIdx.x == 0) { - this->m_values[params::degree / 2] += - source.m_data[params::degree / 2 + begin]; - } - } - - __device__ T &operator[](int i) { return m_values[i]; } -}; - template class Polynomial { public: T *coefficients; @@ -386,28 +298,6 @@ public: } } - __device__ void - to_complex_compressed(PolynomialFourier &dest) { - - int tid = threadIdx.x; -#pragma unroll - for (int i = 0; i < params::opt / 2; i++) { - dest.m_values[tid].x = (double)coefficients[2 * tid]; - dest.m_values[tid].y = (double)coefficients[2 * tid + 1]; - tid += params::degree / params::opt; - } - } - - __device__ void to_complex(PolynomialFourier &dest) { - int tid = threadIdx.x; -#pragma unroll - for (int i = 0; i < params::opt; i++) { - dest.m_values[tid].x = (double)coefficients[tid]; - dest.m_values[tid].y = 0.0; - tid += params::degree / params::opt; - } - } - __device__ void multiply_by_scalar_inplace(T scalar) { int tid = threadIdx.x; const int grid_dim = blockDim.x; diff --git a/src/polynomial/polynomial_math.cuh b/src/polynomial/polynomial_math.cuh index a36667b9a..71fc7924d 100644 --- a/src/polynomial/polynomial_math.cuh +++ b/src/polynomial/polynomial_math.cuh @@ -14,9 +14,9 @@ __device__ void sub_polynomial(FT *result, FT *first, FT *second) { } } -template -__device__ void polynomial_product_in_fourier_domain(FT *result, FT *first, - FT *second) { +template +__device__ void polynomial_product_in_fourier_domain(T *result, T *first, + T *second) { int tid = threadIdx.x; for (int i = 0; i < params::opt / 2; i++) { result[tid] = first[tid] * second[tid]; @@ -29,49 +29,6 @@ __device__ void polynomial_product_in_fourier_domain(FT *result, FT *first, } } -template -__device__ void -polynomial_product_in_fourier_domain(PolynomialFourier &result, - PolynomialFourier &first, - PolynomialFourier &second) { - int tid = threadIdx.x; - for (int i = 0; i < params::opt / 2; i++) { - result[tid] = first[tid] * second[tid]; - tid += params::degree / params::opt; - } - - if (threadIdx.x == 0) { - result[params::degree / 2] = - first[params::degree / 2] * second[params::degree / 2]; - } -} - -template -__device__ void polynomial_product_accumulate_in_fourier_domain( - PolynomialFourier &result, PolynomialFourier &first, - PolynomialFourier &second) { - int tid = threadIdx.x; - for (int i = 0; i < params::opt / 2; i++) { - result[tid] += first[tid] * second[tid]; - tid += params::degree / params::opt; - } - - if (threadIdx.x == 0) { - result[params::degree / 2] += - first[params::degree / 2] * second[params::degree / 2]; - } -} - -template -__device__ void polynomial_product_accumulate_in_fourier_domain( - FT *result, FT *first, PolynomialFourier &second) { - int tid = threadIdx.x; - for (int i = 0; i < params::opt / 2; i++) { - result[tid] += first[tid] * second.m_values[tid]; - tid += params::degree / params::opt; - } -} - template __device__ void polynomial_product_accumulate_in_fourier_domain(T *result, T *first,