mirror of https://github.com/zama-ai/concrete.git (synced 2026-02-10 04:35:03 -05:00)
feat(cuda): introduce cuda acceleration for the pbs and keyswitch
- A new crate, concrete-cuda, is added to the repository, containing Cuda implementations of the bootstrap and keyswitch and a Rust wrapper to call them.
- A new backend_cuda is added to concrete-core, with dedicated entities whose memory is located on the GPU and engines that call the Cuda-accelerated functions.
src/CMakeLists.txt (Normal file, 10 lines)
@@ -0,0 +1,10 @@
set(SOURCES ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h)
# Note: this GLOB overwrites the SOURCES list of headers set just above
file(GLOB SOURCES
    "*.cu"
    "*.h"
    "fft/*.cu")
add_library(concrete_cuda STATIC ${SOURCES})
set_target_properties(concrete_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(concrete_cuda PUBLIC cudart)
target_include_directories(concrete_cuda PRIVATE .)
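This diff does not show include/bootstrap.h itself; judging from the definitions in src/bootstrap_amortized.cu and src/bootstrap_low_latency.cu below, it presumably declares the entry points with C linkage so the Rust wrapper can bind them, along these lines (a sketch, not the committed header):

extern "C" {
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
    void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
    void *lwe_in, void *bootstrapping_key, uint32_t input_lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory);
// the _64 variant, the low-latency variants and the key conversion
// functions presumably follow the same shape
}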
src/bootstrap.cu (Normal file, 1 line)
@@ -0,0 +1 @@
#include "crypto/bootstrapping_key.cuh"
src/bootstrap_amortized.cu (Normal file, 174 lines)
@@ -0,0 +1,174 @@
#include "bootstrap_amortized.cuh"

/* Perform bootstrapping on a batch of input LWE ciphertexts
 *
 * - lwe_out: output batch of num_samples bootstrapped ciphertexts c =
 *   (a0,..an-1,b) where n is the LWE dimension
 * - lut_vector: should hold as many test vectors of size polynomial_size
 *   as there are input ciphertexts, but actually holds
 *   num_lut_vectors vectors to reduce memory usage
 * - lut_vector_indexes: stores the index corresponding to
 *   which test vector to use for each sample in
 *   lut_vector
 * - lwe_in: input batch of num_samples LWE ciphertexts, containing n
 *   mask values + 1 body value
 * - bootstrapping_key: RGSW encryption of the LWE secret key sk1
 *   under secret key sk2
 *   bsk = Z + sk1 H
 *   where H is the gadget matrix and Z is a matrix (k+1).l
 *   containing GLWE encryptions of 0 under sk2.
 *   bsk is thus a tensor of size (k+1)^2.l.N.n
 *   where l is the number of decomposition levels and
 *   k is the GLWE dimension, N is the polynomial size for
 *   GLWE. The polynomial size for GLWE and the test vector
 *   are the same because they have to be in the same ring
 *   to be multiplied.
 *   Note: it is necessary to generate (k+1).k.l.N.n
 *   uniformly random coefficients for the zero encryptions
 * - input_lwe_dimension: size of the Torus vector used to encrypt the input
 *   LWE ciphertexts - referred to as n above (~ 600)
 * - polynomial_size: size of the test polynomial (test vector) and size of the
 *   GLWE polynomial (~1024)
 * - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
 * - l_gadget: number of decomposition levels in the gadget matrix (~4)
 * - num_samples: number of encrypted input messages
 * - num_lut_vectors: parameter to set the actual number of test vectors to be
 *   used
 * - q: number of bits in the integer representation (32 or 64)
 *
 * This function calls a wrapper to a device kernel that performs the
 * bootstrapping:
 * - the kernel is templatized based on integer discretization and
 *   polynomial degree
 * - num_samples blocks of threads are launched, where each thread is going
 *   to handle one or more polynomial coefficients at each stage:
 *   - perform the blind rotation
 *   - round the result
 *   - decompose into l_gadget levels, then for each level:
 *     - switch to the FFT domain
 *     - multiply with the bootstrapping key
 *     - come back to the coefficients representation
 * - between each stage a synchronization of the threads is necessary
 * - in case the device has enough shared memory, temporary arrays used for
 *   the different stages (accumulators) are stored into the shared memory
 * - the accumulators serve to combine the results for all decomposition
 *   levels
 * - the constant memory (64K) is used for storing the roots of identity
 *   values for the FFT
 */
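// For a sense of scale: with the orders of magnitude quoted above (n = 600,
// k = 1, l = 4, N = 1024), the key holds n.(k+1)^2.l = 9600 polynomials; in
// the compressed Fourier layout of N/2 double2 values per polynomial used by
// this backend, that is 9600 * 512 * 16 bytes, i.e. roughly 78.6 MB of GPU
// memory per bootstrapping key.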

void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
    void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
    void *lwe_in, void *bootstrapping_key, uint32_t input_lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory) {

  switch (polynomial_size) {
  case 512:
    host_bootstrap_amortized<uint32_t, Degree<512>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 1024:
    host_bootstrap_amortized<uint32_t, Degree<1024>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 2048:
    host_bootstrap_amortized<uint32_t, Degree<2048>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 4096:
    host_bootstrap_amortized<uint32_t, Degree<4096>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 8192:
    host_bootstrap_amortized<uint32_t, Degree<8192>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  default:
    break;
  }
}

void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
    void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
    void *lwe_in, void *bootstrapping_key, uint32_t input_lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory) {

  switch (polynomial_size) {
  case 512:
    host_bootstrap_amortized<uint64_t, Degree<512>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 1024:
    host_bootstrap_amortized<uint64_t, Degree<1024>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 2048:
    host_bootstrap_amortized<uint64_t, Degree<2048>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 4096:
    host_bootstrap_amortized<uint64_t, Degree<4096>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 8192:
    host_bootstrap_amortized<uint64_t, Degree<8192>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  default:
    break;
  }
}
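A minimal host-side sketch of driving the 64-bit amortized entry point defined above. The staging through cudaMalloc/cudaMemcpyAsync, the helper name and the 48 KB shared-memory value are illustrative assumptions, not part of this commit; the buffer sizes follow the parameter documentation:

#include <cstdint>
#include <vector>
#include <cuda_runtime.h>

void run_amortized_pbs_sketch(const std::vector<uint64_t> &lwe_in_host,
                              uint32_t lwe_dimension, uint32_t polynomial_size,
                              uint32_t base_log, uint32_t l_gadget,
                              uint32_t num_samples,
                              double2 *d_bsk,         // Fourier-domain key, already on device
                              uint64_t *d_lut_vector, // device test vector(s)
                              uint32_t *d_lut_indexes) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  uint64_t *d_lwe_in, *d_lwe_out;
  // each input ciphertext holds lwe_dimension mask values + 1 body value
  cudaMalloc((void **)&d_lwe_in, lwe_in_host.size() * sizeof(uint64_t));
  // each output ciphertext holds polynomial_size + 1 values
  cudaMalloc((void **)&d_lwe_out,
             (size_t)num_samples * (polynomial_size + 1) * sizeof(uint64_t));
  cudaMemcpyAsync(d_lwe_in, lwe_in_host.data(),
                  lwe_in_host.size() * sizeof(uint64_t),
                  cudaMemcpyHostToDevice, stream);
  cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
      &stream, d_lwe_out, d_lut_vector, d_lut_indexes, d_lwe_in, d_bsk,
      lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
      /*num_lut_vectors=*/1, /*lwe_idx=*/0,
      /*max_shared_memory=*/49152); // assume the default 48 KB per block
  cudaStreamSynchronize(stream);
  cudaFree(d_lwe_in);
  cudaFree(d_lwe_out);
  cudaStreamDestroy(stream);
}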
src/bootstrap_amortized.cuh (Normal file, 466 lines)
@@ -0,0 +1,466 @@
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#include <helper_cuda.h>
#endif

#ifndef CNCRT_AMORTIZED_PBS_H
#define CNCRT_AMORTIZED_PBS_H

#include "cooperative_groups.h"

#include "../include/helper_cuda.h"
#include "bootstrap.h"
#include "complex/operations.cuh"
//#include "crypto/bootstrapping_key.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "fft/bnsmfft.cuh"
#include "fft/smfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"

template <typename Torus, class params, sharedMemDegree SMD>
/*
 * Kernel launched by host_bootstrap_amortized
 *
 * Uses shared memory to increase performance
 * - lwe_out: output batch of num_samples bootstrapped ciphertexts c =
 *   (a0,..an-1,b) where n is the LWE dimension
 * - lut_vector: should hold as many test vectors of size polynomial_size
 *   as there are input ciphertexts, but actually holds
 *   num_lut_vectors vectors to reduce memory usage
 * - lut_vector_indexes: stores the index corresponding to which test vector
 *   to use for each sample in lut_vector
 * - lwe_in: input batch of num_samples LWE ciphertexts, containing n mask
 *   values + 1 body value
 * - bootstrapping_key: RGSW encryption of the LWE secret key sk1 under secret
 *   key sk2
 * - device_mem: pointer to the device's global memory in case we use it (SMD
 *   == NOSM or PARTIALSM)
 * - lwe_mask_size: size of the Torus vector used to encrypt the input
 *   LWE ciphertexts - referred to as n above (~ 600)
 * - polynomial_size: size of the test polynomial (test vector) and size of the
 *   GLWE polynomial (~1024)
 * - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
 * - l_gadget: number of decomposition levels in the gadget matrix (~4)
 * - gpu_num: index of the current GPU (useful for multi-GPU computations)
 * - lwe_idx: equal to the number of samples per gpu x gpu_num
 * - device_memory_size_per_sample: amount of global memory to allocate if SMD
 *   is not FULLSM
 */
__global__ void device_bootstrap_amortized(
    Torus *lwe_out, Torus *lut_vector, uint32_t *lut_vector_indexes,
    Torus *lwe_in, double2 *bootstrapping_key, char *device_mem,
    uint32_t lwe_mask_size, uint32_t polynomial_size, uint32_t base_log,
    uint32_t l_gadget, uint32_t lwe_idx,
    size_t device_memory_size_per_sample) {
  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
  extern __shared__ char sharedmem[];
  char *selected_memory;

  if constexpr (SMD == FULLSM)
    selected_memory = sharedmem;
  else
    selected_memory = &device_mem[blockIdx.x * device_memory_size_per_sample];

  // For GPU bootstrapping the RLWE dimension is hard-set to 1: there is only
  // one mask polynomial and 1 body to handle. Also, since the decomposed
  // polynomials take coefficients between -B/2 and B/2 they can be represented
  // with only 16 bits, assuming the base B does not exceed 2^16
  int16_t *accumulator_mask_decomposed = (int16_t *)selected_memory;
  // TODO (Agnes) why not the 16 bits representation here?
  int16_t *accumulator_body_decomposed =
      (int16_t *)accumulator_mask_decomposed + polynomial_size;
  Torus *accumulator_mask = (Torus *)accumulator_body_decomposed +
                            polynomial_size / (sizeof(Torus) / sizeof(int16_t));
  Torus *accumulator_body =
      (Torus *)accumulator_mask + (ptrdiff_t)polynomial_size;
  Torus *accumulator_mask_rotated =
      (Torus *)accumulator_body + (ptrdiff_t)polynomial_size;
  Torus *accumulator_body_rotated =
      (Torus *)accumulator_mask_rotated + (ptrdiff_t)polynomial_size;
  double2 *mask_res_fft = (double2 *)accumulator_body_rotated +
                          polynomial_size / (sizeof(double2) / sizeof(Torus));
  double2 *body_res_fft =
      (double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2;
  double2 *accumulator_fft = (double2 *)sharedmem;
  if constexpr (SMD != PARTIALSM)
    accumulator_fft =
        (double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);

  /*
  int dif0 = ((char*)accumulator_body_decomposed - (char*)selected_memory);
  int dif1 = ((char*)accumulator_mask - (char*)accumulator_body_decomposed);
  int dif2 = ((char*)accumulator_body - (char*)accumulator_mask);
  int dif3 = ((char*)accumulator_mask_rotated - (char*)accumulator_body);
  int dif4 = ((char*)accumulator_body_rotated - (char*)accumulator_mask_rotated);
  int dif5 = ((char*)mask_res_fft - (char*)accumulator_body_rotated);
  int dif6 = ((char*)body_res_fft - (char*)mask_res_fft);
  int dif7 = (SMD != PARTIALSM) ? (char*)accumulator_fft - (char*)body_res_fft : 0;
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    printf("device and shared mem: %d %d %d %d %d %d %d %d\n", dif0, dif1,
           dif2, dif3, dif4, dif5, dif6, dif7);
  }
  */

  auto block_lwe_in = &lwe_in[blockIdx.x * (lwe_mask_size + 1)];
  Torus *block_lut_vector =
      &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * 2];

  // TODO (Agnes) try to store the gadget matrix in const memory to see if
  // register use decreases. Since all const mem is used for twiddles currently,
  // it would mean moving some of them to global memory instead
  GadgetMatrix<Torus, params> gadget(base_log, l_gadget);

  // Put "b", the body, in [0, 2N[
  Torus b_hat = rescale_torus_element(
      block_lwe_in[lwe_mask_size],
      2 * params::degree); // 2 * params::log2_degree + 1);

  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                        params::degree / params::opt>(
      accumulator_mask, block_lut_vector, b_hat, false);

  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                        params::degree / params::opt>(
      accumulator_body, &block_lut_vector[params::degree], b_hat, false);

  // Loop over all the mask elements of the sample to accumulate
  // (X^a_i - 1) multiplication, decomposition of the resulting polynomial
  // into l_gadget polynomials, and performing polynomial multiplication
  // via an FFT with the RGSW encrypted secret key
  for (int iteration = 0; iteration < lwe_mask_size; iteration++) {
    // TODO make sure that the following sync is necessary
    synchronize_threads_in_block();

    // Put "a" in [0, 2N[ instead of Zq
    Torus a_hat = rescale_torus_element(
        block_lwe_in[iteration],
        2 * params::degree); // 2 * params::log2_degree + 1);

    // TODO (Agnes) why is there this if condition?
    if (a_hat == 0) {
      // todo(Joao): **cannot use this optimization**
      // the reason is that one of the input ciphertexts (blockIdx.z)
      // might skip an iteration while others don't, which as a result
      // will make that block not call the grid.sync(), causing a deadlock;
      // maybe it's a workaround to add grid.sync() here, but not sure if
      // there are any edge cases?

      // continue
    }

    // Perform ACC * (X^a_hat - 1)
    multiply_by_monomial_negacyclic_and_sub_polynomial<
        Torus, params::opt, params::degree / params::opt>(
        accumulator_mask, accumulator_mask_rotated, a_hat);

    multiply_by_monomial_negacyclic_and_sub_polynomial<
        Torus, params::opt, params::degree / params::opt>(
        accumulator_body, accumulator_body_rotated, a_hat);

    synchronize_threads_in_block();

    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
        accumulator_mask_rotated, base_log, l_gadget);

    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
        accumulator_body_rotated, base_log, l_gadget);

    // Initialize the polynomial multiplication via FFT arrays
    // The polynomial multiplication happens at the block level
    // and each thread handles two or more coefficients
    int pos = threadIdx.x;
    for (int j = 0; j < params::opt / 2; j++) {
      mask_res_fft[pos].x = 0;
      mask_res_fft[pos].y = 0;
      body_res_fft[pos].x = 0;
      body_res_fft[pos].y = 0;
      pos += params::degree / params::opt;
    }

    // Now that the rotation is done, decompose the resulting polynomial
    // coefficients so as to multiply each decomposed level with the
    // corresponding part of the bootstrapping key
    // TODO (Agnes) explain why we do that for the mask and body separately
    for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) {

      gadget.decompose_one_level(accumulator_mask_decomposed,
                                 accumulator_mask_rotated, decomp_level);

      gadget.decompose_one_level(accumulator_body_decomposed,
                                 accumulator_body_rotated, decomp_level);

      synchronize_threads_in_block();

      // First, perform the polynomial multiplication for the mask

      // Reduce the size of the FFT to be performed by storing
      // the real-valued polynomial into a complex polynomial
      real_to_complex_compressed<int16_t, params>(accumulator_mask_decomposed,
                                                  accumulator_fft);

      synchronize_threads_in_block();
      // Switch to the FFT space
      NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
      synchronize_threads_in_block();

      correction_direct_fft_inplace<params>(accumulator_fft);

      // Get the bootstrapping key piece necessary for the multiplication
      // It is already in the Fourier domain
      // TODO (Agnes) Explain why for the mask polynomial multiplication
      // we need the bsk_body_slice and vice versa
      auto bsk_mask_slice = PolynomialFourier<double2, params>(
          get_ith_mask_kth_block(bootstrapping_key, iteration, 0, decomp_level,
                                 polynomial_size, 1, l_gadget));
      auto bsk_body_slice = PolynomialFourier<double2, params>(
          get_ith_body_kth_block(bootstrapping_key, iteration, 0, decomp_level,
                                 polynomial_size, 1, l_gadget));

      synchronize_threads_in_block();

      // Perform the coefficient-wise product with the two pieces of
      // bootstrapping key TODO (Agnes) why two pieces?
      polynomial_product_accumulate_in_fourier_domain(
          mask_res_fft, accumulator_fft, bsk_mask_slice);
      polynomial_product_accumulate_in_fourier_domain(
          body_res_fft, accumulator_fft, bsk_body_slice);

      synchronize_threads_in_block();

      // Now handle the polynomial multiplication for the body
      // in the same way
      real_to_complex_compressed<int16_t, params>(accumulator_body_decomposed,
                                                  accumulator_fft);
      synchronize_threads_in_block();

      NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
      synchronize_threads_in_block();

      correction_direct_fft_inplace<params>(accumulator_fft);

      auto bsk_mask_slice_2 = PolynomialFourier<double2, params>(
          get_ith_mask_kth_block(bootstrapping_key, iteration, 1, decomp_level,
                                 polynomial_size, 1, l_gadget));
      auto bsk_body_slice_2 = PolynomialFourier<double2, params>(
          get_ith_body_kth_block(bootstrapping_key, iteration, 1, decomp_level,
                                 polynomial_size, 1, l_gadget));

      synchronize_threads_in_block();

      polynomial_product_accumulate_in_fourier_domain(
          mask_res_fft, accumulator_fft, bsk_mask_slice_2);
      polynomial_product_accumulate_in_fourier_domain(
          body_res_fft, accumulator_fft, bsk_body_slice_2);
    }

    // Come back to the coefficient representation
    if constexpr (SMD == FULLSM || SMD == NOSM) {
      synchronize_threads_in_block();

      correction_inverse_fft_inplace<params>(mask_res_fft);
      correction_inverse_fft_inplace<params>(body_res_fft);
      synchronize_threads_in_block();

      NSMFFT_inverse<HalfDegree<params>>(mask_res_fft);
      NSMFFT_inverse<HalfDegree<params>>(body_res_fft);

      synchronize_threads_in_block();

      add_to_torus<Torus, params>(mask_res_fft, accumulator_mask);
      add_to_torus<Torus, params>(body_res_fft, accumulator_body);
      synchronize_threads_in_block();
    } else {
      int tid = threadIdx.x;
#pragma unroll
      for (int i = 0; i < params::opt / 2; i++) {
        accumulator_fft[tid] = mask_res_fft[tid];
        tid = tid + params::degree / params::opt;
      }
      synchronize_threads_in_block();

      correction_inverse_fft_inplace<params>(accumulator_fft);
      synchronize_threads_in_block();

      NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
      synchronize_threads_in_block();

      add_to_torus<Torus, params>(accumulator_fft, accumulator_mask);
      synchronize_threads_in_block();

      tid = threadIdx.x;
#pragma unroll
      for (int i = 0; i < params::opt / 2; i++) {
        accumulator_fft[tid] = body_res_fft[tid];
        tid = tid + params::degree / params::opt;
      }
      synchronize_threads_in_block();

      correction_inverse_fft_inplace<params>(accumulator_fft);
      synchronize_threads_in_block();

      NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
      synchronize_threads_in_block();

      add_to_torus<Torus, params>(accumulator_fft, accumulator_body);
      synchronize_threads_in_block();
    }
  }

  auto block_lwe_out = &lwe_out[blockIdx.x * (polynomial_size + 1)];

  // The blind rotation for this block is over
  // Now we can perform the sample extraction: for the body it's just
  // the resulting constant coefficient of the accumulator
  // For the mask it's more complicated TODO (Agnes) explain why
  sample_extract_mask<Torus, params>(block_lwe_out, accumulator_mask);
  sample_extract_body<Torus, params>(block_lwe_out, accumulator_body);
}
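// rescale_torus_element comes from crypto/torus.cuh, which is not part of this
// diff. A minimal sketch of the operation it presumably performs, assuming a
// 32-bit torus and round-to-nearest scaling into [0, 2N[ (i.e. keeping the
// log2(2N) most significant bits); the name and the exact rounding are
// assumptions, not the crate's actual implementation:
__device__ inline uint32_t rescale_torus_element_sketch(uint32_t t,
                                                        uint32_t two_n) {
  uint32_t log2_two_n = 31 - __clz((int)two_n); // two_n is a power of two
  uint32_t shift = 32 - log2_two_n;
  // add half an output step before shifting so the truncation rounds to
  // nearest, then reduce modulo 2N
  return ((t + (1u << (shift - 1))) >> shift) & (two_n - 1);
}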

template <typename Torus, class params>
__host__ void host_bootstrap_amortized(
    void *v_stream, Torus *lwe_out, Torus *lut_vector,
    uint32_t *lut_vector_indexes, Torus *lwe_in, double2 *bootstrapping_key,
    uint32_t input_lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t l_gadget, uint32_t input_lwe_ciphertext_count,
    uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {

  int SM_FULL = sizeof(Torus) * polynomial_size +   // accumulator mask
                sizeof(Torus) * polynomial_size +   // accumulator body
                sizeof(Torus) * polynomial_size +   // accumulator mask rotated
                sizeof(Torus) * polynomial_size +   // accumulator body rotated
                sizeof(int16_t) * polynomial_size + // accumulator_dec mask
                sizeof(int16_t) * polynomial_size + // accumulator_dec body
                sizeof(double2) * polynomial_size / 2 + // accumulator fft mask
                sizeof(double2) * polynomial_size / 2 + // accumulator fft body
                sizeof(double2) * polynomial_size / 2;  // calculation buffer fft

  int SM_PART = sizeof(double2) * polynomial_size / 2; // calculation buffer fft

  int DM_PART = SM_FULL - SM_PART;

  int DM_FULL = SM_FULL;

  auto stream = static_cast<cudaStream_t *>(v_stream);

  char *d_mem;

  // Create a 1-dimensional grid of threads
  // where each block handles 1 sample and each thread
  // handles opt polynomial coefficients
  // (actually opt/2 coefficients since we compress the real polynomial into a
  // complex one)
  // TODO (Agnes) polynomial_size / params::opt should be equal to 256 or 512
  // probably, maybe 1024 would be too big?
  // Or would it actually be good in our case to have the largest possible
  // number of threads per block since anyway few blocks will run
  // concurrently?
  dim3 grid(input_lwe_ciphertext_count, 1, 1);
  dim3 thds(polynomial_size / params::opt, 1, 1);

  // Launch the kernel using polynomial_size / opt threads
  // where each thread computes opt polynomial coefficients
  // Depending on the required amount of shared memory, choose
  // from one of three templates (no use, partial use or full use
  // of shared memory)
  if (max_shared_memory < SM_PART) {
    checkCudaErrors(
        cudaMalloc((void **)&d_mem, DM_FULL * input_lwe_ciphertext_count));
    device_bootstrap_amortized<Torus, params, NOSM>
        <<<grid, thds, 0, *stream>>>(
            lwe_out, lut_vector, lut_vector_indexes, lwe_in, bootstrapping_key,
            d_mem, input_lwe_dimension, polynomial_size, base_log, l_gadget,
            lwe_idx, DM_FULL);
  } else if (max_shared_memory < SM_FULL) {
    cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
                         cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART);
    cudaFuncSetCacheConfig(
        device_bootstrap_amortized<Torus, params, PARTIALSM>,
        cudaFuncCachePreferShared);
    checkCudaErrors(
        cudaMalloc((void **)&d_mem, DM_PART * input_lwe_ciphertext_count));
    device_bootstrap_amortized<Torus, params, PARTIALSM>
        <<<grid, thds, SM_PART, *stream>>>(
            lwe_out, lut_vector, lut_vector_indexes, lwe_in, bootstrapping_key,
            d_mem, input_lwe_dimension, polynomial_size, base_log, l_gadget,
            lwe_idx, DM_PART);
  } else {
    // For devices with compute capability 7.x a single thread block can
    // address the full capacity of shared memory. Shared memory on the
    // device then has to be allocated dynamically.
    // For lower compute capabilities, this call
    // just does nothing and the amount of shared memory used is 48 KB
    checkCudaErrors(cudaFuncSetAttribute(
        device_bootstrap_amortized<Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
    // TODO (Agnes): is this necessary?
    checkCudaErrors(cudaFuncSetCacheConfig(
        device_bootstrap_amortized<Torus, params, FULLSM>,
        cudaFuncCachePreferShared));
    checkCudaErrors(cudaMalloc((void **)&d_mem, 0));

    device_bootstrap_amortized<Torus, params, FULLSM>
        <<<grid, thds, SM_FULL, *stream>>>(
            lwe_out, lut_vector, lut_vector_indexes, lwe_in, bootstrapping_key,
            d_mem, input_lwe_dimension, polynomial_size, base_log, l_gadget,
            lwe_idx, 0);
  }
  // Synchronize the stream before copying the result to lwe_out at the right
  // place
  cudaStreamSynchronize(*stream);
  cudaFree(d_mem);
}

template <typename Torus, class params>
int cuda_get_pbs_per_gpu(int polynomial_size) {

  int blocks_per_sm = 0;
  int num_threads = polynomial_size / params::opt;
  cudaDeviceProp device_properties;
  // FIXME: here we assume every device has the same properties
  cudaGetDeviceProperties(&device_properties, 0);
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocks_per_sm, device_bootstrap_amortized<Torus, params, FULLSM>,
      num_threads, 0);

  return device_properties.multiProcessorCount * blocks_per_sm;
}

#endif // CNCRT_AMORTIZED_PBS_H
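To gauge the shared-memory pressure of the FULLSM path: with Torus = uint64_t and polynomial_size = 1024, SM_FULL comes to 4 * 8 * 1024 bytes for the four Torus accumulators, 2 * 2 * 1024 bytes for the two int16_t decomposition buffers, and 3 * 16 * 512 bytes for the three double2 FFT buffers, i.e. 32768 + 4096 + 24576 = 61440 bytes (60 KB). That exceeds the default 48 KB dynamic shared-memory limit, which is why the FULLSM branch has to raise cudaFuncAttributeMaxDynamicSharedMemorySize before launching.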
src/bootstrap_low_latency.cu (Normal file, 183 lines)
@@ -0,0 +1,183 @@
#include "bootstrap_low_latency.cuh"

/* Perform bootstrapping on a batch of input LWE ciphertexts
 *
 * - lwe_out: output batch of num_samples bootstrapped ciphertexts c =
 *   (a0,..an-1,b) where n is the LWE dimension
 * - lut_vector: should hold as many test vectors of size polynomial_size
 *   as there are input ciphertexts, but actually holds
 *   num_lut_vectors vectors to reduce memory usage
 * - lut_vector_indexes: stores the index corresponding to
 *   which test vector to use for each sample in
 *   lut_vector
 * - lwe_in: input batch of num_samples LWE ciphertexts, containing n
 *   mask values + 1 body value
 * - bootstrapping_key: RGSW encryption of the LWE secret key sk1
 *   under secret key sk2
 *   bsk = Z + sk1 H
 *   where H is the gadget matrix and Z is a matrix (k+1).l
 *   containing GLWE encryptions of 0 under sk2.
 *   bsk is thus a tensor of size (k+1)^2.l.N.n
 *   where l is the number of decomposition levels and
 *   k is the GLWE dimension, N is the polynomial size for
 *   GLWE. The polynomial size for GLWE and the test vector
 *   are the same because they have to be in the same ring
 *   to be multiplied.
 *   Note: it is necessary to generate (k+1).k.l.N.n
 *   uniformly random coefficients for the zero encryptions
 * - lwe_dimension: size of the Torus vector used to encrypt the input
 *   LWE ciphertexts - referred to as n above (~ 600)
 * - polynomial_size: size of the test polynomial (test vector) and size of the
 *   GLWE polynomial (~1024)
 * - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
 * - l_gadget: number of decomposition levels in the gadget matrix (~4)
 * - num_samples: number of encrypted input messages
 * - num_lut_vectors: parameter to set the actual number of test vectors to be
 *   used
 * - q: number of bits in the integer representation (32 or 64)
 *
 * This function calls a wrapper to a device kernel that performs the
 * bootstrapping:
 * - the kernel is templatized based on integer discretization and
 *   polynomial degree
 * - num_samples blocks of threads are launched, where each thread is going
 *   to handle one or more polynomial coefficients at each stage:
 *   - perform the blind rotation
 *   - round the result
 *   - decompose into l_gadget levels, then for each level:
 *     - switch to the FFT domain
 *     - multiply with the bootstrapping key
 *     - come back to the coefficients representation
 * - between each stage a synchronization of the threads is necessary TODO
 *   (Agnes) check this
 * - in case the device has enough shared memory, temporary arrays used for
 *   the different stages (accumulators) are stored into the shared memory
 * - the accumulators serve to combine the results for all decomposition
 *   levels TODO (Agnes) check this
 * - the constant memory (64K) is used for storing the roots of identity
 *   values for the FFT
 */
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
    void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
    void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory) {

  switch (polynomial_size) {
  case 512:
    host_bootstrap_low_latency<uint32_t, Degree<512>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 1024:
    host_bootstrap_low_latency<uint32_t, Degree<1024>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 2048:
    host_bootstrap_low_latency<uint32_t, Degree<2048>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 4096:
    host_bootstrap_low_latency<uint32_t, Degree<4096>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 8192:
    host_bootstrap_low_latency<uint32_t, Degree<8192>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  default:
    break;
  }
}

void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
    void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
    void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory) {

  switch (polynomial_size) {
  case 512:
    host_bootstrap_low_latency<uint64_t, Degree<512>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 1024:
    host_bootstrap_low_latency<uint64_t, Degree<1024>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 2048:
    host_bootstrap_low_latency<uint64_t, Degree<2048>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 4096:
    host_bootstrap_low_latency<uint64_t, Degree<4096>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 8192:
    host_bootstrap_low_latency<uint64_t, Degree<8192>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  default:
    break;
  }
}
src/bootstrap_low_latency.cuh (Normal file, 348 lines)
@@ -0,0 +1,348 @@
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#include <helper_cuda.h>
#endif

#ifndef LOWLAT_PBS_H
#define LOWLAT_PBS_H

#include "cooperative_groups.h"

#include "../include/helper_cuda.h"
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "fft/bnsmfft.cuh"
#include "fft/smfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"

// Cooperative groups are used in the low latency
// version of the bootstrapping
using namespace cooperative_groups;
namespace cg = cooperative_groups;

template <typename Torus, class params>
__device__ void
mul_trgsw_trlwe(Torus *accumulator, double2 *fft, int16_t *trlwe_decomposed,
                double2 *mask_join_buffer, double2 *body_join_buffer,
                double2 *bootstrapping_key, int polynomial_size, int l_gadget,
                int iteration, grid_group &grid) {

  // Put the decomposed TRLWE sample in the Fourier domain
  real_to_complex_compressed<int16_t, params>(trlwe_decomposed, fft);
  synchronize_threads_in_block();

  // Switch to the FFT space
  NSMFFT_direct<HalfDegree<params>>(fft);
  synchronize_threads_in_block();

  correction_direct_fft_inplace<params>(fft);
  synchronize_threads_in_block();

  // Get the pieces of the bootstrapping key that will be needed for the
  // external product; blockIdx.x is the ID of the block that's executing
  // this function, so we end up getting the lines of the bootstrapping key
  // needed to perform the external product in this block (corresponding to
  // the same decomposition level)

  // auto bsk_mask_slice = bootstrapping_key.get_ith_mask_kth_block(
  //     gpu_num, iteration, blockIdx.y, blockIdx.x);
  // auto bsk_body_slice = bootstrapping_key.get_ith_body_kth_block(
  //     gpu_num, iteration, blockIdx.y, blockIdx.x);

  auto bsk_mask_slice = PolynomialFourier<double2, params>(
      get_ith_mask_kth_block(bootstrapping_key, iteration, blockIdx.y,
                             blockIdx.x, polynomial_size, 1, l_gadget));
  auto bsk_body_slice = PolynomialFourier<double2, params>(
      get_ith_body_kth_block(bootstrapping_key, iteration, blockIdx.y,
                             blockIdx.x, polynomial_size, 1, l_gadget));

  // Perform the matrix multiplication between the RGSW and the TRLWE,
  // each block operating on a single level for mask and body

  auto first_processed_bsk =
      (blockIdx.y == 0) ? bsk_mask_slice : bsk_body_slice;
  auto second_processed_bsk =
      (blockIdx.y == 0) ? bsk_body_slice : bsk_mask_slice;

  auto first_processed_acc =
      (blockIdx.y == 0) ? &mask_join_buffer[params::degree / 2 * blockIdx.x]
                        : &body_join_buffer[params::degree / 2 * blockIdx.x];
  auto second_processed_acc =
      (blockIdx.y == 0) ? &body_join_buffer[params::degree / 2 * blockIdx.x]
                        : &mask_join_buffer[params::degree / 2 * blockIdx.x];

  int tid = threadIdx.x;

  // first product
  for (int i = 0; i < params::opt / 2; i++) {
    first_processed_acc[tid] = fft[tid] * first_processed_bsk.m_values[tid];
    tid += params::degree / params::opt;
  }

  grid.sync();
  tid = threadIdx.x;
  // second product
  for (int i = 0; i < params::opt / 2; i++) {
    second_processed_acc[tid] += fft[tid] * second_processed_bsk.m_values[tid];
    tid += params::degree / params::opt;
  }

  // -----------------------------------------------------------------

  // All blocks are synchronized here; after this sync, *_join_buffer has the
  // values needed from every other block
  grid.sync();

  auto src_acc = (blockIdx.y == 0) ? mask_join_buffer : body_join_buffer;

  // copy the first product into the fft buffer
  tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    fft[tid] = src_acc[tid];
    tid += params::degree / params::opt;
  }
  synchronize_threads_in_block();

  // accumulate the rest of the products into the fft buffer
  for (int l = 1; l < gridDim.x; l++) {
    auto cur_src_acc = &src_acc[l * params::degree / 2];
    tid = threadIdx.x;
    for (int i = 0; i < params::opt / 2; i++) {
      fft[tid] += cur_src_acc[tid];
      tid += params::degree / params::opt;
    }
  }

  synchronize_threads_in_block();

  correction_inverse_fft_inplace<params>(fft);
  synchronize_threads_in_block();

  // Perform the inverse FFT on the result of the RGSW x TRLWE product and add
  // it to the accumulator
  NSMFFT_inverse<HalfDegree<params>>(fft);
  synchronize_threads_in_block();

  add_to_torus<Torus, params>(fft, accumulator);

  __syncthreads();
}

template <typename Torus, class params>
/*
 * Kernel launched by the low latency version of the
 * bootstrapping, that uses cooperative groups
 * - lwe_out: vector of output LWE ciphertexts, with length
 *   (polynomial_size + 1) * num_samples
 * - lut_vector: vector of look-up tables with length
 *   polynomial_size * num_samples
 * - lut_vector_indexes: mapping between lwe_in and lut_vector
 * - lwe_in: vector of LWE inputs with length (lwe_mask_size + 1) * num_samples
 */
__global__ void device_bootstrap_low_latency(
    Torus *lwe_out, Torus *lut_vector, Torus *lwe_in,
    double2 *bootstrapping_key, double2 *mask_join_buffer,
    double2 *body_join_buffer, uint32_t lwe_mask_size,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget) {

  grid_group grid = this_grid();

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
  extern __shared__ char sharedmem[];

  char *selected_memory = sharedmem;

  int16_t *accumulator_decomposed = (int16_t *)selected_memory;
  Torus *accumulator = (Torus *)accumulator_decomposed +
                       polynomial_size / (sizeof(Torus) / sizeof(int16_t));
  double2 *accumulator_fft =
      (double2 *)accumulator +
      polynomial_size / (sizeof(double2) / sizeof(Torus));

  // Reuse memory from accumulator_fft for accumulator_rotated
  Torus *accumulator_rotated = (Torus *)accumulator_fft;

  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
  auto block_lwe_in = &lwe_in[blockIdx.z * (lwe_mask_size + 1)];

  auto block_lut_vector = &lut_vector[blockIdx.z * params::degree * 2];

  auto block_mask_join_buffer =
      &mask_join_buffer[blockIdx.z * l_gadget * params::degree / 2];
  auto block_body_join_buffer =
      &body_join_buffer[blockIdx.z * l_gadget * params::degree / 2];

  // Since the space in L1 cache is small, we use the same memory location for
  // the rotated accumulator and the fft accumulator, since we know that the
  // rotated array is not in use anymore by the time we perform the fft

  GadgetMatrix<Torus, params> gadget(base_log, l_gadget);

  // Put "b" in [0, 2N[
  Torus b_hat =
      rescale_torus_element(block_lwe_in[lwe_mask_size], 2 * params::degree);

  if (blockIdx.y == 0) {
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, block_lut_vector, b_hat, false);
  } else {
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[params::degree], b_hat, false);
  }

  for (int i = 0; i < lwe_mask_size; i++) {
    synchronize_threads_in_block();

    // Put "a" in [0, 2N[
    Torus a_hat = rescale_torus_element(
        block_lwe_in[i],
        2 * params::degree); // 2 * params::log2_degree + 1);

    if (a_hat == 0) {
      // todo(Joao): **cannot use this optimization**
      // the reason is that one of the input ciphertexts (blockIdx.z)
      // might skip an iteration while others don't, which as a result
      // will make that block not call the grid.sync(), causing a deadlock;
      // maybe it's a workaround to add grid.sync() here, but not sure if
      // there are any edge cases?

      // continue
    }

    // Perform ACC * (X^a_hat - 1)
    multiply_by_monomial_negacyclic_and_sub_polynomial<
        Torus, params::opt, params::degree / params::opt>(
        accumulator, accumulator_rotated, a_hat);

    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
        accumulator_rotated, base_log, l_gadget);

    // Decompose the accumulator. Each block gets one level of the
    // decomposition, for the mask and the body (so block 0 will have the
    // accumulator decomposed at level 0, 1 at 1, etc.)
    gadget.decompose_one_level(accumulator_decomposed, accumulator_rotated,
                               blockIdx.x);

    // We are using the same memory space for accumulator_fft and
    // accumulator_rotated, so we need to synchronize here to make sure they
    // don't modify the same memory space at the same time
    synchronize_threads_in_block();

    // Perform G^-1(ACC) * RGSW -> TRLWE
    mul_trgsw_trlwe<Torus, params>(
        accumulator, accumulator_fft, accumulator_decomposed,
        block_mask_join_buffer, block_body_join_buffer, bootstrapping_key,
        polynomial_size, l_gadget, i, grid);
  }

  auto block_lwe_out = &lwe_out[blockIdx.z * (polynomial_size + 1)];

  if (blockIdx.x == 0 && blockIdx.y == 0) {
    // Perform a sample extract. At this point, all blocks have the result, but
    // we do the computation at block 0 to avoid waiting for extra blocks, in
    // case they're not synchronized
    sample_extract_mask<Torus, params>(block_lwe_out, accumulator);
  } else if (blockIdx.x == 0 && blockIdx.y == 1) {
    sample_extract_body<Torus, params>(block_lwe_out, accumulator);
  }
}
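// A concrete reading of the launch geometry used by the host wrapper below:
// with l_gadget = 4 and num_samples = 100 the grid is dim3(4, 2, 100), i.e.
// 800 blocks, where block (x, y, z) processes decomposition level x of the
// mask (y == 0) or body (y == 1) accumulator of sample z. Because the kernel
// relies on grid.sync(), all of these blocks must be resident on the device
// at once, which is what bounds the batch size of the low-latency variant.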

/*
 * Host wrapper to the low latency version
 * of bootstrapping
 */
template <typename Torus, class params>
__host__ void host_bootstrap_low_latency(
    void *v_stream, Torus *lwe_out, Torus *lut_vector,
    uint32_t *lut_vector_indexes, Torus *lwe_in, double2 *bootstrapping_key,
    uint32_t lwe_mask_size, uint32_t polynomial_size, uint32_t base_log,
    uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors) {
  auto stream = static_cast<cudaStream_t *>(v_stream);

  int buffer_size_per_gpu =
      l_gadget * num_samples * polynomial_size / 2 * sizeof(double2);
  double2 *mask_buffer_fft;
  double2 *body_buffer_fft;
  checkCudaErrors(cudaMalloc((void **)&mask_buffer_fft, buffer_size_per_gpu));
  checkCudaErrors(cudaMalloc((void **)&body_buffer_fft, buffer_size_per_gpu));

  int bytes_needed =
      sizeof(int16_t) * polynomial_size +    // accumulator_decomposed
      sizeof(Torus) * polynomial_size +      // accumulator
      sizeof(double2) * polynomial_size / 2; // accumulator fft

  int thds = polynomial_size / params::opt;
  dim3 grid(l_gadget, 2, num_samples);

  void *kernel_args[10];
  kernel_args[0] = &lwe_out;
  kernel_args[1] = &lut_vector;
  kernel_args[2] = &lwe_in;
  kernel_args[3] = &bootstrapping_key;
  kernel_args[4] = &mask_buffer_fft;
  kernel_args[5] = &body_buffer_fft;
  kernel_args[6] = &lwe_mask_size;
  kernel_args[7] = &polynomial_size;
  kernel_args[8] = &base_log;
  kernel_args[9] = &l_gadget;

  checkCudaErrors(cudaFuncSetAttribute(
      device_bootstrap_low_latency<Torus, params>,
      cudaFuncAttributeMaxDynamicSharedMemorySize, bytes_needed));
  cudaFuncSetCacheConfig(device_bootstrap_low_latency<Torus, params>,
                         cudaFuncCachePreferShared);

  checkCudaErrors(cudaLaunchCooperativeKernel(
      (void *)device_bootstrap_low_latency<Torus, params>, grid, thds,
      (void **)kernel_args, bytes_needed, *stream));

  // Synchronize the stream before copying the result to lwe_out at the right
  // place
  cudaStreamSynchronize(*stream);
  cudaFree(mask_buffer_fft);
  cudaFree(body_buffer_fft);
}

#endif // LOWLAT_PBS_H
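cudaLaunchCooperativeKernel, used by the host wrapper above, only succeeds on devices that support cooperative launch, so a defensive caller would probably check the device attribute first. A minimal sketch (not part of this commit):

int supports_coop = 0;
cudaDeviceGetAttribute(&supports_coop, cudaDevAttrCooperativeLaunch,
                       /*device=*/0);
if (!supports_coop) {
  // fall back to the amortized bootstrap, which only needs block-level syncs
}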
src/complex/operations.cuh (Normal file, 87 lines)
@@ -0,0 +1,87 @@
#ifndef GPU_BOOTSTRAP_COMMON_CUH
#define GPU_BOOTSTRAP_COMMON_CUH

#include <cassert>
#include <cstdint>
#include <cstdio>

#define SNT 1
#define dPI 6.283185307179586231995926937088

using sTorus = int32_t;
// using Torus = uint32_t;
using u32 = uint32_t;
using i32 = int32_t;

//--------------------------------------------------
// Basic double2 operations

__device__ inline double2 conjugate(const double2 num) {
  double2 res;
  res.x = num.x;
  res.y = -num.y;
  return res;
}

__device__ inline void operator+=(double2 &lh, const double2 rh) {
  lh.x += rh.x;
  lh.y += rh.y;
}

__device__ inline void operator-=(double2 &lh, const double2 rh) {
  lh.x -= rh.x;
  lh.y -= rh.y;
}

__device__ inline double2 operator+(const double2 a, const double2 b) {
  double2 res;
  res.x = a.x + b.x;
  res.y = a.y + b.y;
  return res;
}

__device__ inline double2 operator-(const double2 a, const double2 b) {
  double2 res;
  res.x = a.x - b.x;
  res.y = a.y - b.y;
  return res;
}

__device__ inline double2 operator*(const double2 a, const double2 b) {
  double xx = a.x * b.x;
  double xy = a.x * b.y;
  double yx = a.y * b.x;
  double yy = a.y * b.y;

  double2 res;
  // asm volatile("fma.rn.f64 %0, %1, %2, %3;": "=d"(res.x) : "d"(a.x),
  // "d"(b.x), "d"(yy));
  res.x = xx - yy;
  res.y = xy + yx;
  return res;
}

__device__ inline double2 operator*(const double2 a, double b) {
  double2 res;
  res.x = a.x * b;
  res.y = a.y * b;
  return res;
}

__device__ inline void operator*=(double2 &a, const double2 b) {
  double tmp = a.x;
  a.x *= b.x;
  a.x -= a.y * b.y;
  a.y *= b.x;
  a.y += b.y * tmp;
}

__device__ inline double2 operator*(double a, double2 b) {
  double2 res;
  res.x = b.x * a;
  res.y = b.y * a;
  return res;
}

#endif
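The commented-out asm in the complex operator* above hints at a fused multiply-add variant. With the documented CUDA intrinsics it would look roughly like this sketch (an alternative, not what the commit ships):

__device__ inline double2 complex_mul_fma(const double2 a, const double2 b) {
  double2 res;
  res.x = __fma_rn(a.x, b.x, -(a.y * b.y)); // a.x*b.x - a.y*b.y
  res.y = __fma_rn(a.x, b.y, a.y * b.x);    // a.x*b.y + a.y*b.x
  return res;
}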
src/crypto/bootstrapping_key.cuh (Normal file, 157 lines)
@@ -0,0 +1,157 @@
#ifndef CNCRT_BSK_H
#define CNCRT_BSK_H

#include "bootstrap.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include <atomic>
#include <cstdint>

__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
                                         int glwe_dimension,
                                         uint32_t l_gadget) {
  return i * polynomial_size / 2 * (glwe_dimension + 1) *
         (glwe_dimension + 1) * l_gadget;
}

__device__ double2 *
get_ith_mask_kth_block(double2 *ptr, int i, int k, int level,
                       uint32_t polynomial_size, int glwe_dimension,
                       uint32_t l_gadget) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
                                 l_gadget) +
              level * polynomial_size / 2 * (glwe_dimension + 1) *
                  (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1)];
}

__device__ double2 *
get_ith_body_kth_block(double2 *ptr, int i, int k, int level,
                       uint32_t polynomial_size, int glwe_dimension,
                       uint32_t l_gadget) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
                                 l_gadget) +
              level * polynomial_size / 2 * (glwe_dimension + 1) *
                  (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1) +
              polynomial_size / 2];
}
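// A host-side mirror of the indexing above can help reason about the bsk
// layout; a sketch under the same assumptions (glwe_dimension = 1,
// polynomials stored compressed as polynomial_size / 2 double2 values).
// For example, i = 2, k = 1, level = 3, N = 1024, l_gadget = 4 gives
// 2*8192 + 3*2048 + 1*1024 = 23552 double2 values into the key.
inline size_t bsk_mask_offset_sketch(int i, int k, int level,
                                     uint32_t polynomial_size,
                                     int glwe_dimension, uint32_t l_gadget) {
  size_t ggsw_size = (size_t)polynomial_size / 2 * (glwe_dimension + 1) *
                     (glwe_dimension + 1) * l_gadget;
  return (size_t)i * ggsw_size +
         (size_t)level * polynomial_size / 2 * (glwe_dimension + 1) *
             (glwe_dimension + 1) +
         (size_t)k * polynomial_size / 2 * (glwe_dimension + 1);
}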
|
||||
|
||||
void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) {
|
||||
cudaSetDevice(gpu_index);
|
||||
int sw_size = polynomial_size / 2;
|
||||
short *sw1_h, *sw2_h;
|
||||
|
||||
sw1_h = (short *)malloc(sizeof(short) * sw_size);
|
||||
sw2_h = (short *)malloc(sizeof(short) * sw_size);
|
||||
|
||||
memset(sw1_h, 0, sw_size * sizeof(short));
|
||||
memset(sw2_h, 0, sw_size * sizeof(short));
|
||||
int cnt = 0;
|
||||
for (int i = 1, j = 0; i < polynomial_size / 2; i++) {
|
||||
int bit = (polynomial_size / 2) >> 1;
|
||||
for (; j & bit; bit >>= 1)
|
||||
j ^= bit;
|
||||
j ^= bit;
|
||||
|
||||
if (i < j) {
|
||||
sw1_h[cnt] = i;
|
||||
sw2_h[cnt] = j;
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
cudaMemcpyToSymbol(SW1, sw1_h, sw_size * sizeof(short), 0,
|
||||
cudaMemcpyHostToDevice);
|
||||
cudaMemcpyToSymbol(SW2, sw2_h, sw_size * sizeof(short), 0,
|
||||
cudaMemcpyHostToDevice);
|
||||
free(sw1_h);
|
||||
free(sw2_h);
|
||||
}
|
||||
|
||||
template <typename T, typename ST>
|
||||
void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
|
||||
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
|
||||
uint32_t l_gadget, uint32_t polynomial_size) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
int shared_memory_size = sizeof(double) * polynomial_size;
|
||||
|
||||
int total_polynomials =
|
||||
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
|
||||
l_gadget;
|
||||
|
||||
// Here the buffer size is the size of double2 times the number of polynomials
|
||||
// times the polynomial size over 2 because the polynomials are compressed
|
||||
// into the complex domain to perform the FFT
|
||||
size_t buffer_size = total_polynomials * polynomial_size / 2 * sizeof
|
||||
(double2);
|
||||
|
||||
int gridSize = total_polynomials;
|
||||
int blockSize = polynomial_size / choose_opt(polynomial_size);
|
||||
// todo(Joao): let's use cudaMallocHost here,
|
||||
// since it allocates page-staged memory which allows
|
||||
// faster data copy
|
||||
double2 *h_bsk = (double2 *)malloc(buffer_size);
|
||||
double2 *d_bsk;
|
||||
cudaMalloc((void **)&d_bsk, buffer_size);
|
||||
|
||||
// compress real bsk to complex and divide it on DOUBLE_MAX
|
||||
for (int i = 0; i < total_polynomials; i++) {
|
||||
int complex_current_poly_idx = i * polynomial_size / 2;
|
||||
int torus_current_poly_idx = i * polynomial_size;
|
||||
for (int j = 0; j < polynomial_size / 2; j++) {
|
||||
h_bsk[complex_current_poly_idx + j].x =
|
||||
src[torus_current_poly_idx + 2 * j];
|
||||
h_bsk[complex_current_poly_idx + j].y =
|
||||
src[torus_current_poly_idx + 2 * j + 1];
|
||||
h_bsk[complex_current_poly_idx + j].x /=
|
||||
(double)std::numeric_limits<T>::max();
|
||||
h_bsk[complex_current_poly_idx + j].y /=
|
||||
(double)std::numeric_limits<T>::max();
|
||||
}
|
||||
}
|
||||
|
||||
cudaMemcpy(d_bsk, h_bsk, buffer_size, cudaMemcpyHostToDevice);
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
switch (polynomial_size) {
|
||||
// FIXME (Agnes): check if polynomial sizes are ok
|
||||
case 512:
|
||||
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
break;
|
||||
case 1024:
|
||||
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
break;
|
||||
case 2048:
|
||||
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
break;
|
||||
case 4096:
|
||||
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
break;
|
||||
case 8192:
|
||||
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
cudaFree(d_bsk);
|
||||
free(h_bsk);
|
||||
|
||||
}
|
||||
|
||||
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
|
||||
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
|
||||
uint32_t l_gadget, uint32_t polynomial_size) {
|
||||
cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>((double2 *)dest, (int32_t *)src,
|
||||
v_stream, gpu_index, input_lwe_dim,
|
||||
glwe_dim, l_gadget, polynomial_size);
|
||||
}
|
||||
|
||||
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
|
||||
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
|
||||
uint32_t l_gadget, uint32_t polynomial_size) {
|
||||
cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>((double2 *)dest, (int64_t *)src,
|
||||
v_stream, gpu_index, input_lwe_dim,
|
||||
glwe_dim, l_gadget, polynomial_size);
|
||||
}
|
||||
|
||||
#endif // CNCRT_BSK_H
|
||||
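To make the compression step above concrete: each Torus polynomial of size N is packed into N/2 complex coefficients (even coefficients in the real part, odd ones in the imaginary part) and normalized by the maximum of the integer representation. A minimal host-side sketch of that transformation for 64-bit keys follows; compress_polynomial is a hypothetical helper written for illustration, not part of this diff:

#include <complex>
#include <cstdint>
#include <limits>
#include <vector>

// Packs one real Torus polynomial of size N into N/2 complex values,
// normalized to roughly [-0.5, 0.5], mirroring the conversion loop of
// cuda_convert_lwe_bootstrap_key above.
std::vector<std::complex<double>>
compress_polynomial(const std::vector<int64_t> &poly) {
  std::vector<std::complex<double>> out(poly.size() / 2);
  const double max = (double)std::numeric_limits<uint64_t>::max();
  for (size_t j = 0; j < out.size(); j++) {
    // even coefficient -> real part, odd coefficient -> imaginary part
    out[j] = {poly[2 * j] / max, poly[2 * j + 1] / max};
  }
  return out;
}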
91
src/crypto/gadget.cuh
Normal file
@@ -0,0 +1,91 @@
#ifndef CNCRT_CRYPTO_H
#define CNCRT_CRYPTO_H

#include "polynomial/polynomial.cuh"
#include <cstdint>

template <typename T, class params> class GadgetMatrix {
private:
  uint32_t l_gadget;
  uint32_t base_log;
  uint32_t mask;
  uint32_t halfbg;
  T offset;

public:
  __device__ GadgetMatrix(uint32_t base_log, uint32_t l_gadget)
      : base_log(base_log), l_gadget(l_gadget) {
    uint32_t bg = 1 << base_log;
    this->halfbg = bg / 2;
    this->mask = bg - 1;
    T temp = 0;
    for (int i = 0; i < this->l_gadget; i++) {
      temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
    }
    this->offset = temp * this->halfbg;
  }

  template <typename V, typename U>
  __device__ void decompose_one_level(Polynomial<V, params> &result,
                                      Polynomial<U, params> &polynomial,
                                      uint32_t level) {
    int tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      T s = polynomial.coefficients[tid] + this->offset;
      uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
      T temp1 = (s >> decal) & this->mask;
      result.coefficients[tid] = (V)(temp1 - this->halfbg);
      tid += params::degree / params::opt;
    }
  }

  template <typename V, typename U>
  __device__ void decompose_one_level(V *result, U *polynomial,
                                      uint32_t level) {
    int tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      T s = polynomial[tid] + this->offset;
      uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
      T temp1 = (s >> decal) & this->mask;
      result[tid] = (V)(temp1 - this->halfbg);
      tid += params::degree / params::opt;
    }
  }

  __device__ T decompose_one_level_single(T element, uint32_t level) {
    T s = element + this->offset;
    uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
    T temp1 = (s >> decal) & this->mask;
    return (T)(temp1 - this->halfbg);
  }
};

template <typename T> class GadgetMatrixSingle {
private:
  uint32_t l_gadget;
  uint32_t base_log;
  uint32_t mask;
  uint32_t halfbg;
  T offset;

public:
  __device__ GadgetMatrixSingle(uint32_t base_log, uint32_t l_gadget)
      : base_log(base_log), l_gadget(l_gadget) {
    uint32_t bg = 1 << base_log;
    this->halfbg = bg / 2;
    this->mask = bg - 1;
    T temp = 0;
    for (int i = 0; i < this->l_gadget; i++) {
      temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
    }
    this->offset = temp * this->halfbg;
  }

  __device__ T decompose_one_level_single(T element, uint32_t level) {
    T s = element + this->offset;
    uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
    T temp1 = (s >> decal) & this->mask;
    return (T)(temp1 - this->halfbg);
  }
};

#endif // CNCRT_CRYPTO_H
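To make the decomposition concrete, here is a host-side sketch that mirrors decompose_one_level_single for T = uint64_t, base_log = 8 and l_gadget = 4, and checks that recomposing the signed digits against the gadget basis gives back the input when the input is already a multiple of 2^(64 - l_gadget*base_log) (which round_to_closest_multiple in crypto/torus.cuh guarantees). Everything below is illustrative, not part of the diff:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t base_log = 8, l_gadget = 4;
  const uint64_t bg = 1ULL << base_log, halfbg = bg / 2, mask = bg - 1;
  uint64_t offset = 0;
  for (uint32_t i = 0; i < l_gadget; i++)
    offset += 1ULL << (64 - (i + 1) * base_log);
  offset *= halfbg;

  const uint64_t x = 0x1234567900000000ULL; // already a multiple of 2^32
  uint64_t recomposed = 0;
  for (uint32_t level = 0; level < l_gadget; level++) {
    // same computation as decompose_one_level_single
    uint64_t s = x + offset;
    uint32_t decal = 64 - (level + 1) * base_log;
    int64_t digit = (int64_t)((s >> decal) & mask) - (int64_t)halfbg;
    // digits live in [-halfbg, halfbg); recompose digit * 2^decal
    recomposed += (uint64_t)(digit << decal);
  }
  assert(recomposed == x);
  return 0;
}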
45
src/crypto/torus.cuh
Normal file
@@ -0,0 +1,45 @@
#ifndef CNCRT_TORUS_H
#define CNCRT_TORUS_H

#include "types/int128.cuh"
#include <limits>

template <typename Torus>
__device__ inline Torus typecast_double_to_torus(double x) {
  if constexpr (sizeof(Torus) < 8) {
    // this simple cast works up to 32 bits, afterwards we must do it manually
    long long ret = x;
    return (Torus)ret;
  } else {
    int128 nnnn = make_int128_from_float(x);
    uint64_t lll = nnnn.lo_;
    return lll;
  }
  // nvcc doesn't get that the if {} else {} above always returns
  // something, and complains that this function might return nothing, so we
  // put this useless return here
  return 0;
}

template <typename T>
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
                                              uint32_t l_gadget) {
  T shift = sizeof(T) * 8 - l_gadget * base_log;
  T mask = 1ll << (shift - 1);
  T b = (x & mask) >> (shift - 1);
  T res = x >> shift;
  res += b;
  res <<= shift;
  return res;
}

template <typename T>
__device__ __forceinline__ T rescale_torus_element(T element,
                                                   uint32_t log_shift) {
  // todo(Joao): not sure if this works
  // return element >> log_shift;
  return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
               (double)log_shift);
}

#endif // CNCRT_TORUS_H
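A quick worked example of round_to_closest_multiple with T = uint64_t, base_log = 8 and l_gadget = 4, which is the rounding applied before gadget decomposition:

// shift = 64 - 4 * 8 = 32, so x is rounded to the closest multiple of 2^32.
// With x = 0x123456789ABCDEF0:
//   b   = bit 31 of x = 1           (the discarded low part is >= 2^31)
//   res = (x >> 32) + 1 = 0x12345679
//   res <<= 32  ->  0x1234567900000000
// round_to_closest_multiple(0x123456789ABCDEF0, 8, 4) == 0x1234567900000000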
147
src/device.cu
Normal file
@@ -0,0 +1,147 @@
#include "device.h"
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>
#include <helper_cuda.h>

/// Unsafe function to create a CUDA stream, must check first that the GPU exists
void *cuda_create_stream(uint32_t gpu_index) {
  cudaSetDevice(gpu_index);
  cudaStream_t *stream = new cudaStream_t;
  cudaStreamCreate(stream);
  return stream;
}

/// Unsafe function to destroy a CUDA stream, must check first that the GPU exists
int cuda_destroy_stream(void *v_stream, uint32_t gpu_index) {
  cudaSetDevice(gpu_index);
  auto stream = static_cast<cudaStream_t *>(v_stream);
  cudaStreamDestroy(*stream);
  return 0;
}

/// Unsafe function that will try to allocate even if gpu_index is invalid
/// or if there's not enough memory. A safe wrapper around it must call
/// cuda_check_valid_malloc() first
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
  cudaSetDevice(gpu_index);
  void *ptr;
  checkCudaErrors(cudaMalloc((void **)&ptr, size));

  return ptr;
}

/// Checks that the allocation is valid
///  0: valid
/// -1: invalid, not enough memory on the device
/// -2: invalid, gpu index doesn't exist
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {

  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaSetDevice(gpu_index);
  size_t total_mem, free_mem;
  cudaMemGetInfo(&free_mem, &total_mem);
  if (size > free_mem) {
    // error code: not enough memory
    return -1;
  }
  return 0;
}

/// Tries to copy memory to the GPU asynchronously
///  0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                             void *v_stream, uint32_t gpu_index) {
  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaPointerAttributes attr;
  cudaPointerGetAttributes(&attr, dest);
  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
    // error code: invalid device pointer
    return -1;
  }
  auto stream = static_cast<cudaStream_t *>(v_stream);
  cudaSetDevice(gpu_index);
  checkCudaErrors(cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice,
                                  *stream));
  return 0;
}

/// Synchronizes the device
///  0: success
/// -2: error, gpu index doesn't exist
int cuda_synchronize_device(uint32_t gpu_index) {
  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaSetDevice(gpu_index);
  cudaDeviceSynchronize();
  return 0;
}

/// Tries to copy memory back to the CPU asynchronously
///  0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                             void *v_stream, uint32_t gpu_index) {
  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaPointerAttributes attr;
  cudaPointerGetAttributes(&attr, src);
  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
    // error code: invalid device pointer
    return -1;
  }
  auto stream = static_cast<cudaStream_t *>(v_stream);
  cudaSetDevice(gpu_index);
  checkCudaErrors(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost,
                                  *stream));
  return 0;
}

/// Returns the number of GPUs available
int cuda_get_number_of_gpus() {
  int num_gpus;
  cudaGetDeviceCount(&num_gpus);
  return num_gpus;
}

/// Drops a cuda array
int cuda_drop(void *ptr, uint32_t gpu_index) {
  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaSetDevice(gpu_index);
  checkCudaErrors(cudaFree(ptr));
  return 0;
}

/// Gets the maximum size for the shared memory
int cuda_get_max_shared_memory(uint32_t gpu_index) {
  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaSetDevice(gpu_index);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, gpu_index);
  int max_shared_memory = 0;
  if (prop.major > 7) {
    max_shared_memory = prop.sharedMemPerMultiprocessor;
  } else {
    max_shared_memory = prop.sharedMemPerBlock;
  }
  return max_shared_memory;
}
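A hedged usage sketch of the helpers above, showing the intended call order (check the allocation first, then allocate, copy, synchronize and free); demo is a hypothetical function written for illustration:

#include <cstdint>
#include <vector>

int demo(uint32_t gpu_index) {
  const uint64_t size = 1024 * sizeof(uint64_t);
  if (cuda_check_valid_malloc(size, gpu_index) != 0)
    return -1;
  void *stream = cuda_create_stream(gpu_index);
  void *d_buf = cuda_malloc(size, gpu_index);
  std::vector<uint64_t> h_buf(1024, 42);
  cuda_memcpy_async_to_gpu(d_buf, h_buf.data(), size, stream, gpu_index);
  // ... launch kernels on `stream` here ...
  cuda_memcpy_async_to_cpu(h_buf.data(), d_buf, size, stream, gpu_index);
  cuda_synchronize_device(gpu_index);
  cuda_drop(d_buf, gpu_index);
  cuda_destroy_stream(stream, gpu_index);
  return 0;
}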
753
src/fft/bnsmfft.cuh
Normal file
@@ -0,0 +1,753 @@
#ifndef GPU_BOOTSTRAP_FFT_1024_CUH
#define GPU_BOOTSTRAP_FFT_1024_CUH

#include "complex/operations.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "twiddles.cuh"

/*
 * bit reverse
 * coefficient bits are reversed based on the precalculated
 * SW1 and SW2 swap indexes
 */
template <class params> __device__ void bit_reverse_inplace(double2 *A) {
  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    short sw1 = SW1[tid];
    short sw2 = SW2[tid];
    double2 tmp = A[sw1];
    A[sw1] = A[sw2];
    A[sw2] = tmp;
    tid += params::degree / params::opt;
  }
}

/*
 * negacyclic twiddle
 * returns the negacyclic twiddle based on degree and index
 * twiddles are precalculated inside the negTwids{3..13} arrays
 */
template <int degree> __device__ double2 negacyclic_twiddle(int j) {
  double2 twid;
  switch (degree) {
  case 512:
    twid = negTwids9[j];
    break;
  case 1024:
    twid = negTwids10[j];
    break;
  case 2048:
    twid = negTwids11[j];
    break;
  case 4096:
    twid = negTwids12[j];
    break;
  case 8192:
    twid = negTwids13[j];
    break;
  default:
    twid.x = 0;
    twid.y = 0;
    break;
  }
  return twid;
}

/*
 * Direct negacyclic FFT:
 * - before the FFT the N real coefficients are stored into a
 *   N/2 sized complex with the even coefficients in the real part
 *   and the odd coefficients in the imaginary part. This is referred to
 *   as the half-size FFT
 * - when calling NSMFFT_direct for the forward negacyclic FFT of PBS,
 *   opt is divided by 2 because the butterfly pattern is always applied
 *   between pairs of coefficients
 * - instead of twisting each coefficient A_j before the FFT by
 *   multiplying by the w^j roots of unity (aka twiddles, w=exp(-i pi /N)),
 *   the FFT is modified, and for each level k of the FFT the twiddle:
 *   w_j,k = exp(-i pi j/2^k)
 *   is replaced with:
 *   \zeta_j,k = exp(-i pi (2j-1)/2^k)
 * - this technique also implies a correction of the
 *   complex obtained after the FFT, which is done in the
 *   forward_negacyclic_fft_inplace function of bootstrap.cuh
 */
template <class params> __device__ void NSMFFT_direct(double2 *A) {
  /* First, reverse the bits of the input complex
   * The bit reversal for the half-size FFT has been stored into the
   * SW1 and SW2 arrays beforehand
   * Each thread is always in charge of "opt/2" pairs of coefficients,
   * which is why we always loop through N/2 by N/opt strides
   * The pragma unroll instruction tells the compiler to unroll the
   * full loop, which should increase performance TODO (Agnes) check this
   */
  bit_reverse_inplace<params>(A);
  __syncthreads();

  // Now we go through all the levels of the FFT one by one
  // (instead of recursively)
  // first iteration: k=1, zeta=i for all coefficients
  int tid = threadIdx.x;
  int i1, i2;
  double2 u, v;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 2
    i1 = tid << 1;
    i2 = i1 + 1;
    u = A[i1];
    // v = i*A[i2]
    v.y = A[i2].x;
    v.x = -A[i2].y;
    // A[i1] <- A[i1] + i*A[i2]
    // A[i2] <- A[i1] - i*A[i2]
    A[i1] += v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // second iteration: apply the butterfly pattern
  // between groups of 4 coefficients
  // k=2, \zeta=exp(i pi/4) for even coefficients and
  // exp(3 i pi / 4) for odd coefficients
  // TODO (Agnes) how does this work on the gpu? aren't we doing
  // a lot more computations than we should?
  tid = threadIdx.x;
  // odd = 0 for even coefficients, 1 for odd coefficients
  int odd = tid & 1;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 2
    // i1=2*tid if tid is even and 2*tid-1 if it is odd
    i1 = (tid << 1) - odd;
    i2 = i1 + 2;

    double a = A[i2].x;
    double b = A[i2].y;
    u = A[i1];

    // \zeta_j,2 = exp(-i pi (2j-1)/4) -> j=0: exp(i pi/4) or j=1: exp(-i pi/4)
    // \zeta_even = sqrt(2)/2 + i * sqrt(2)/2 = sqrt(2)/2*(1+i)
    // \zeta_odd = sqrt(2)/2 - i * sqrt(2)/2 = sqrt(2)/2*(1-i)

    // v_j = \zeta_j * (a+i*b)
    // v_even = sqrt(2)/2*((a-b)+i*(a+b))
    // v_odd = sqrt(2)/2*(a+b+i*(b-a))
    v.x =
        (odd) ? (-0.707106781186548) * (a + b) : (0.707106781186548) * (a - b);
    v.y = (odd) ? (0.707106781186548) * (a - b) : (0.707106781186548) * (a + b);

    // v.x = (0.707106781186548 * odd) * (a + b) + (0.707106781186548 * (!odd))
    // * (a - b); v.y = (0.707106781186548 * odd) * (b - a) + (0.707106781186548
    // * (!odd)) * (a + b);

    A[i1] = u + v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // third iteration
  // from k=3 on, we have to do the full complex multiplication
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 4
    // rem is the remainder of tid/4. tid takes values:
    // 0, 1, 2, 3, 4, 5, 6, 7, ... N/4
    // then rem takes values:
    // 0, 1, 2, 3, 0, 1, 2, 3, ... N/4
    // and striding by 4 will allow us to cover all
    // the coefficients correctly
    int rem = tid & 3;
    i1 = (tid << 1) - rem;
    i2 = i1 + 4;

    double2 w = negTwids3[rem];
    u = A[i1], v = A[i2] * w;

    A[i1] = u + v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 4th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 8
    // rem is the remainder of tid/8. tid takes values:
    // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ... N/4
    // then rem takes values:
    // 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, ... N/4
    // and striding by 8 will allow us to cover all
    // the coefficients correctly
    int rem = tid & 7;
    i1 = (tid << 1) - rem;
    i2 = i1 + 8;

    double2 w = negTwids4[rem];
    u = A[i1], v = A[i2] * w;
    A[i1] = u + v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 5th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 16
    // rem is the remainder of tid/16
    // and the same logic as for previous iterations applies
    int rem = tid & 15;
    i1 = (tid << 1) - rem;
    i2 = i1 + 16;
    double2 w = negTwids5[rem];
    u = A[i1], v = A[i2] * w;
    A[i1] = u + v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 6th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 32
    // rem is the remainder of tid/32
    // and the same logic as for previous iterations applies
    int rem = tid & 31;
    i1 = (tid << 1) - rem;
    i2 = i1 + 32;
    double2 w = negTwids6[rem];
    u = A[i1], v = A[i2] * w;
    A[i1] = u + v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 7th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 64
    // rem is the remainder of tid/64
    // and the same logic as for previous iterations applies
    int rem = tid & 63;
    i1 = (tid << 1) - rem;
    i2 = i1 + 64;
    double2 w = negTwids7[rem];
    u = A[i1], v = A[i2] * w;
    A[i1] = u + v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 8th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 128
    // rem is the remainder of tid/128
    // and the same logic as for previous iterations applies
    int rem = tid & 127;
    i1 = (tid << 1) - rem;
    i2 = i1 + 128;
    double2 w = negTwids8[rem];
    u = A[i1], v = A[i2] * w;
    A[i1] = u + v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();
  if constexpr (params::log2_degree > 8) {
    // 9th iteration
    tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 256
      // rem is the remainder of tid/256
      // and the same logic as for previous iterations applies
      int rem = tid & 255;
      i1 = (tid << 1) - rem;
      i2 = i1 + 256;
      double2 w = negTwids9[rem];
      u = A[i1], v = A[i2] * w;
      A[i1] = u + v;
      A[i2] = u - v;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }
  if constexpr (params::log2_degree > 9) {
    // 10th iteration
    tid = threadIdx.x;
    //#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 512
      // rem is the remainder of tid/512
      // and the same logic as for previous iterations applies
      int rem = tid & 511;
      i1 = (tid << 1) - rem;
      i2 = i1 + 512;
      double2 w = negTwids10[rem];
      u = A[i1], v = A[i2] * w;
      A[i1] = u + v;
      A[i2] = u - v;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }

  if constexpr (params::log2_degree > 10) {
    // 11th iteration
    tid = threadIdx.x;
    //#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 1024
      // rem is the remainder of tid/1024
      // and the same logic as for previous iterations applies
      int rem = tid & 1023;
      i1 = (tid << 1) - rem;
      i2 = i1 + 1024;
      double2 w = negTwids11[rem];
      u = A[i1], v = A[i2] * w;
      A[i1] = u + v;
      A[i2] = u - v;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }

  if constexpr (params::log2_degree > 11) {
    // 12th iteration
    tid = threadIdx.x;
    //#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 2048
      // rem is the remainder of tid/2048
      // and the same logic as for previous iterations applies
      int rem = tid & 2047;
      i1 = (tid << 1) - rem;
      i2 = i1 + 2048;
      double2 w = negTwids12[rem];
      u = A[i1], v = A[i2] * w;
      A[i1] = u + v;
      A[i2] = u - v;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }
  // Real polynomials handled should not exceed a degree of 8192
}

/*
 * negacyclic inverse fft
 */
template <class params> __device__ void NSMFFT_inverse(double2 *A) {
  /* The inverse FFT runs the same butterfly levels as the direct FFT in
   * reverse order, using conjugated twiddles and a 1/2 scaling per level,
   * and applies the bit reversal (stored in the SW1 and SW2 arrays) at the
   * end. Each thread is always in charge of "opt/2" pairs of coefficients,
   * which is why we always loop through N/2 by N/opt strides
   */
  int tid;
  int i1, i2;
  double2 u, v;
  if constexpr (params::log2_degree > 11) {
    // 12th iteration
    tid = threadIdx.x;
    //#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 2048
      // rem is the remainder of tid/2048
      // and the same logic as for previous iterations applies
      int rem = tid & 2047;
      i1 = (tid << 1) - rem;
      i2 = i1 + 2048;
      double2 w = conjugate(negTwids12[rem]);
      u = A[i1], v = A[i2];
      A[i1] = (u + v) * 0.5;
      A[i2] = (u - v) * w * 0.5;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }

  if constexpr (params::log2_degree > 10) {
    // 11th iteration
    tid = threadIdx.x;
    //#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 1024
      // rem is the remainder of tid/1024
      // and the same logic as for previous iterations applies
      int rem = tid & 1023;
      i1 = (tid << 1) - rem;
      i2 = i1 + 1024;
      double2 w = conjugate(negTwids11[rem]);
      u = A[i1], v = A[i2];
      A[i1] = (u + v) * 0.5;
      A[i2] = (u - v) * w * 0.5;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }

  if constexpr (params::log2_degree > 9) {
    // 10th iteration
    tid = threadIdx.x;
    //#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 512
      // rem is the remainder of tid/512
      // and the same logic as for previous iterations applies
      int rem = tid & 511;
      i1 = (tid << 1) - rem;
      i2 = i1 + 512;
      double2 w = conjugate(negTwids10[rem]);
      u = A[i1], v = A[i2];
      A[i1] = (u + v) * 0.5;
      A[i2] = (u - v) * w * 0.5;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }

  if constexpr (params::log2_degree > 8) {
    // 9th iteration
    tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 256
      // rem is the remainder of tid/256
      // and the same logic as for previous iterations applies
      int rem = tid & 255;
      i1 = (tid << 1) - rem;
      i2 = i1 + 256;
      double2 w = conjugate(negTwids9[rem]);
      u = A[i1], v = A[i2];
      A[i1] = (u + v) * 0.5;
      A[i2] = (u - v) * w * 0.5;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }

  // 8th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 128
    // rem is the remainder of tid/128
    // and the same logic as for previous iterations applies
    int rem = tid & 127;
    i1 = (tid << 1) - rem;
    i2 = i1 + 128;
    double2 w = conjugate(negTwids8[rem]);
    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 7th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 64
    // rem is the remainder of tid/64
    // and the same logic as for previous iterations applies
    int rem = tid & 63;
    i1 = (tid << 1) - rem;
    i2 = i1 + 64;
    double2 w = conjugate(negTwids7[rem]);
    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 6th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 32
    // rem is the remainder of tid/32
    // and the same logic as for previous iterations applies
    int rem = tid & 31;
    i1 = (tid << 1) - rem;
    i2 = i1 + 32;
    double2 w = conjugate(negTwids6[rem]);
    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 5th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 16
    // rem is the remainder of tid/16
    // and the same logic as for previous iterations applies
    int rem = tid & 15;
    i1 = (tid << 1) - rem;
    i2 = i1 + 16;
    double2 w = conjugate(negTwids5[rem]);
    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 4th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 8
    // rem is the remainder of tid/8. tid takes values:
    // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ... N/4
    // then rem takes values:
    // 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, ... N/4
    // and striding by 8 will allow us to cover all
    // the coefficients correctly
    int rem = tid & 7;
    i1 = (tid << 1) - rem;
    i2 = i1 + 8;

    double2 w = conjugate(negTwids4[rem]);
    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // third iteration
  // from k=3 on, we have to do the full complex multiplication
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 4
    // rem is the remainder of tid/4. tid takes values:
    // 0, 1, 2, 3, 4, 5, 6, 7, ... N/4
    // then rem takes values:
    // 0, 1, 2, 3, 0, 1, 2, 3, ... N/4
    // and striding by 4 will allow us to cover all
    // the coefficients correctly
    int rem = tid & 3;
    i1 = (tid << 1) - rem;
    i2 = i1 + 4;

    double2 w = conjugate(negTwids3[rem]);
    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // second iteration: apply the butterfly pattern
  // between groups of 4 coefficients
  // k=2, \zeta=exp(i pi/4) for even coefficients and
  // exp(3 i pi / 4) for odd coefficients
  // TODO (Agnes) how does this work on the gpu? aren't we doing
  // a lot more computations than we should?
  tid = threadIdx.x;
  // odd = 0 for even coefficients, 1 for odd coefficients
  int odd = tid & 1;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 2
    // i1=2*tid if tid is even and 2*tid-1 if it is odd
    i1 = (tid << 1) - odd;
    i2 = i1 + 2;

    // TODO(Beka) optimize twiddle multiplication
    double2 w;
    if (odd) {
      w.x = -0.707106781186547461715008466854;
      w.y = -0.707106781186547572737310929369;
    } else {
      w.x = 0.707106781186547461715008466854;
      w.y = -0.707106781186547572737310929369;
    }

    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // first iteration: k=1, zeta=i for all coefficients
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 2
    i1 = tid << 1;
    i2 = i1 + 1;
    // TODO(Beka) optimize twiddle multiplication
    double2 w = {0, -1};
    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  bit_reverse_inplace<params>(A);
  __syncthreads();
  // Real polynomials handled should not exceed a degree of 8192
}

/*
 * correction after the direct fft
 * does not use extra shared memory for the recovery:
 * the correction is done using registers.
 * based on Pascal's paper
 */
template <class params>
__device__ void correction_direct_fft_inplace(double2 *x) {
  constexpr int threads = params::degree / params::opt;
  int tid = threadIdx.x;
  double2 left[params::opt / 4];
  double2 right[params::opt / 4];
#pragma unroll
  for (int i = 0; i < params::opt / 4; i++) {
    left[i] = x[tid + i * threads];
  }
#pragma unroll
  for (int i = 0; i < params::opt / 4; i++) {
    right[i] = x[params::degree / 2 - (tid + i * threads + 1)];
  }
#pragma unroll
  for (int i = 0; i < params::opt / 4; i++) {
    double2 tw = negacyclic_twiddle<params::degree>(tid + i * threads);
    double add_RE = left[i].x + right[i].x;
    double sub_RE = left[i].x - right[i].x;
    double add_IM = left[i].y + right[i].y;
    double sub_IM = left[i].y - right[i].y;

    double tmp1 = add_IM * tw.x + sub_RE * tw.y;
    double tmp2 = -sub_RE * tw.x + add_IM * tw.y;
    x[tid + i * threads].x = (add_RE + tmp1) * 0.5;
    x[tid + i * threads].y = (sub_IM + tmp2) * 0.5;
    x[params::degree / 2 - (tid + i * threads + 1)].x = (add_RE - tmp1) * 0.5;
    x[params::degree / 2 - (tid + i * threads + 1)].y = (-sub_IM + tmp2) * 0.5;
  }
}

/*
 * correction before the inverse fft
 * does not use extra shared memory for the recovery:
 * the correction is done using registers.
 * based on Pascal's paper
 */
template <class params>
__device__ void correction_inverse_fft_inplace(double2 *x) {
  constexpr int threads = params::degree / params::opt;
  int tid = threadIdx.x;
  double2 left[params::opt / 4];
  double2 right[params::opt / 4];
#pragma unroll
  for (int i = 0; i < params::opt / 4; i++) {
    left[i] = x[tid + i * threads];
  }
#pragma unroll
  for (int i = 0; i < params::opt / 4; i++) {
    right[i] = x[params::degree / 2 - (tid + i * threads + 1)];
  }
#pragma unroll
  for (int i = 0; i < params::opt / 4; i++) {
    double2 tw = negacyclic_twiddle<params::degree>(tid + i * threads);
    double add_RE = left[i].x + right[i].x;
    double sub_RE = left[i].x - right[i].x;
    double add_IM = left[i].y + right[i].y;
    double sub_IM = left[i].y - right[i].y;

    double tmp1 = add_IM * tw.x - sub_RE * tw.y;
    double tmp2 = sub_RE * tw.x + add_IM * tw.y;
    x[tid + i * threads].x = (add_RE - tmp1) * 0.5;
    x[tid + i * threads].y = (sub_IM + tmp2) * 0.5;
    x[params::degree / 2 - (tid + i * threads + 1)].x = (add_RE + tmp1) * 0.5;
    x[params::degree / 2 - (tid + i * threads + 1)].y = (-sub_IM + tmp2) * 0.5;
  }
}

/*
 * global batch fft
 * performs the fft in half size
 * unrolling the half-size fft results in half size + 1 elements
 * this function must be called with the actual degree
 * and takes an already compressed input
 */
template <class params>
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output) {
  extern __shared__ double2 sharedMemoryFFT[];
  double2 *fft = sharedMemoryFFT;

  int tid = threadIdx.x;

#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    fft[tid] = d_input[blockIdx.x * (params::degree / 2) + tid];
    tid = tid + params::degree / params::opt;
  }
  __syncthreads();
  NSMFFT_direct<HalfDegree<params>>(fft);
  __syncthreads();
  correction_direct_fft_inplace<params>(fft);
  __syncthreads();

  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
    tid = tid + params::degree / params::opt;
  }
}

#endif // GPU_BOOTSTRAP_FFT_1024_CUH
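When testing the negacyclic FFT path above, a naive O(N^2) product over Z[X]/(X^N + 1) makes a convenient CPU oracle: a coefficient that wraps past degree N re-enters with a negated sign, since X^N = -1. This is a test-side sketch, not part of the GPU code:

#include <cstdint>
#include <vector>

std::vector<int64_t> negacyclic_mul(const std::vector<int64_t> &a,
                                    const std::vector<int64_t> &b) {
  const size_t N = a.size();
  std::vector<int64_t> c(N, 0);
  for (size_t i = 0; i < N; i++) {
    for (size_t j = 0; j < N; j++) {
      if (i + j < N)
        c[i + j] += a[i] * b[j]; // regular term
      else
        c[i + j - N] -= a[i] * b[j]; // wrapped term picks up a minus sign
    }
  }
  return c;
}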
402
src/fft/smfft.cuh
Normal file
@@ -0,0 +1,402 @@
/*
#ifndef GPU_BOOTSTRAP_SMFFT_CUH
#define GPU_BOOTSTRAP_SMFFT_CUH

#include "../complex/operations.cuh"
#include "twiddles.cuh"

__device__ __inline__ double2 Get_W_value_inverse(int index) {
  double2 ctemp = _gTwiddles[index];
  ctemp.y = -ctemp.y;
  return (ctemp);
}

template <class params>
__device__ double2 Get_after_inverse_fft_twiddle(int index) {
  double2 ctemp;
  switch (params::degree) {
  case 512:
    ctemp = INVERSE_TWIDDLES_512[index];
    break;
  case 1024:
    ctemp = gTwiddles1024[index];
    ctemp.x /= params::degree;
    ctemp.y /= -params::degree;
    break;
  default:
    break;
  }

  return ctemp;
}

__device__ __inline__ double shfl(double *value, int par) {
#if (CUDART_VERSION >= 9000)
  return (__shfl_sync(0xffffffff, (*value), par));
#else
  return (__shfl((*value), par));
#endif
}

__device__ __inline__ double shfl_xor(double *value, int par) {
#if (CUDART_VERSION >= 9000)
  return (__shfl_xor_sync(0xffffffff, (*value), par));
#else
  return (__shfl_xor((*value), par));
#endif
}

__device__ __inline__ double shfl_down(double *value, int par) {
#if (CUDART_VERSION >= 9000)
  return (__shfl_down_sync(0xffffffff, (*value), par));
#else
  return (__shfl_down((*value), par));
#endif
}

__device__ __inline__ void
reorder_16_register(double2 *A_DFT_value, double2 *B_DFT_value,
                    double2 *C_DFT_value, double2 *D_DFT_value, int *local_id) {
  double2 Af2temp, Bf2temp, Cf2temp, Df2temp;
  unsigned int target = (((unsigned int)__brev(((*local_id) & 15))) >> (28)) +
                        16 * ((*local_id) >> 4);
  Af2temp.x = shfl(&(A_DFT_value->x), target);
  Af2temp.y = shfl(&(A_DFT_value->y), target);
  Bf2temp.x = shfl(&(B_DFT_value->x), target);
  Bf2temp.y = shfl(&(B_DFT_value->y), target);
  Cf2temp.x = shfl(&(C_DFT_value->x), target);
  Cf2temp.y = shfl(&(C_DFT_value->y), target);
  Df2temp.x = shfl(&(D_DFT_value->x), target);
  Df2temp.y = shfl(&(D_DFT_value->y), target);
  __syncwarp();
  (*A_DFT_value) = Af2temp;
  (*B_DFT_value) = Bf2temp;
  (*C_DFT_value) = Cf2temp;
  (*D_DFT_value) = Df2temp;
}

__device__ __inline__ void reorder_32_register(double2 *A_DFT_value,
                                               double2 *B_DFT_value,
                                               double2 *C_DFT_value,
                                               double2 *D_DFT_value) {
  double2 Af2temp, Bf2temp, Cf2temp, Df2temp;
  unsigned int target = ((unsigned int)__brev(threadIdx.x)) >> (27);
  Af2temp.x = shfl(&(A_DFT_value->x), target);
  Af2temp.y = shfl(&(A_DFT_value->y), target);
  Bf2temp.x = shfl(&(B_DFT_value->x), target);
  Bf2temp.y = shfl(&(B_DFT_value->y), target);
  Cf2temp.x = shfl(&(C_DFT_value->x), target);
  Cf2temp.y = shfl(&(C_DFT_value->y), target);
  Df2temp.x = shfl(&(D_DFT_value->x), target);
  Df2temp.y = shfl(&(D_DFT_value->y), target);
  __syncwarp();
  (*A_DFT_value) = Af2temp;
  (*B_DFT_value) = Bf2temp;
  (*C_DFT_value) = Cf2temp;
  (*D_DFT_value) = Df2temp;
}

template <class params>
__device__ __inline__ void
reorder_512(double2 *s_input, double2 *A_DFT_value, double2 *B_DFT_value,
            double2 *C_DFT_value, double2 *D_DFT_value) {
  int local_id = threadIdx.x & (params::warp - 1);
  int warp_id = threadIdx.x / params::warp;

  // reorder elements within warp so we can save them in semi-transposed manner
  // into shared memory
  reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value);

  __syncthreads();
  unsigned int sm_store_pos =
      (local_id >> 1) + 16 * (local_id & 1) + warp_id * 132;
  s_input[sm_store_pos] = *A_DFT_value;
  s_input[sm_store_pos + 33] = *B_DFT_value;
  s_input[66 + sm_store_pos] = *C_DFT_value;
  s_input[66 + sm_store_pos + 33] = *D_DFT_value;

  __syncthreads();

  // Read shared memory to get reordered input
  unsigned int sm_read_pos = (local_id & 15) * 32 + local_id + warp_id * 4;
  __syncthreads();
  *A_DFT_value = s_input[sm_read_pos + 0];
  *B_DFT_value = s_input[sm_read_pos + 1];
  *C_DFT_value = s_input[sm_read_pos + 2];
  *D_DFT_value = s_input[sm_read_pos + 3];

  __syncthreads();
  reorder_16_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value,
                      &local_id);

  __syncthreads();
}

template <class params>
__device__ __inline__ void
reorder_1024(double2 *s_input, double2 *A_DFT_value, double2 *B_DFT_value,
             double2 *C_DFT_value, double2 *D_DFT_value) {
  int local_id = threadIdx.x & (params::warp - 1);
  int warp_id = threadIdx.x / params::warp;

  // reorder elements within params::warp so we can save them in
  // semi-transposed manner into shared memory
  reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value);

  __syncthreads();
  unsigned int sm_store_pos =
      (local_id >> 0) + 32 * (local_id & 0) + warp_id * 132;
  s_input[sm_store_pos] = *A_DFT_value;
  s_input[sm_store_pos + 33] = *B_DFT_value;
  s_input[66 + sm_store_pos] = *C_DFT_value;
  s_input[66 + sm_store_pos + 33] = *D_DFT_value;

  // Read shared memory to get reordered input
  unsigned int sm_read_pos = (local_id & 31) * 32 + local_id + warp_id * 4;
  __syncthreads();
  *A_DFT_value = s_input[sm_read_pos + 0];
  *B_DFT_value = s_input[sm_read_pos + 1];
  *C_DFT_value = s_input[sm_read_pos + 2];
  *D_DFT_value = s_input[sm_read_pos + 3];

  __syncthreads();
  reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value);
}

__device__ bool printOnce = true;

template <class params> __device__ void do_SMFFT_CT_DIT(double2 *s_input) {
  double2 A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value;
  double2 W;
  double2 Aftemp, Bftemp, Cftemp, Dftemp;

  int j, m_param;
  int parity, itemp;
  int A_read_index, B_read_index, C_read_index, D_read_index;
  int PoT, PoTp1, q;

  int local_id = threadIdx.x & (params::warp - 1);
  int warp_id = threadIdx.x / params::warp;
  A_DFT_value = s_input[local_id + (warp_id << 2) * params::warp];
  B_DFT_value =
      s_input[local_id + (warp_id << 2) * params::warp + params::warp];
  C_DFT_value =
      s_input[local_id + (warp_id << 2) * params::warp + 2 * params::warp];
  D_DFT_value =
      s_input[local_id + (warp_id << 2) * params::warp + 3 * params::warp];

  switch (params::log2_degree) {
  case 9:
    reorder_512<params>(s_input, &A_DFT_value, &B_DFT_value, &C_DFT_value,
                        &D_DFT_value);
    break;
  case 10:
    reorder_1024<params>(s_input, &A_DFT_value, &B_DFT_value, &C_DFT_value,
                         &D_DFT_value);
    break;
  // case 11:
  //   reorder_2048<params, opt>(s_input, &A_DFT_value, &B_DFT_value,
  //                             &C_DFT_value, &D_DFT_value);
  //   break;
  default:
    break;
  }

  //----> FFT
  PoT = 1;
  PoTp1 = 2;

  //--> First iteration
  itemp = local_id & 1;
  parity = (1 - itemp * 2);

  A_DFT_value.x = parity * A_DFT_value.x + shfl_xor(&A_DFT_value.x, 1);
  A_DFT_value.y = parity * A_DFT_value.y + shfl_xor(&A_DFT_value.y, 1);
  B_DFT_value.x = parity * B_DFT_value.x + shfl_xor(&B_DFT_value.x, 1);
  B_DFT_value.y = parity * B_DFT_value.y + shfl_xor(&B_DFT_value.y, 1);
  C_DFT_value.x = parity * C_DFT_value.x + shfl_xor(&C_DFT_value.x, 1);
  C_DFT_value.y = parity * C_DFT_value.y + shfl_xor(&C_DFT_value.y, 1);
  D_DFT_value.x = parity * D_DFT_value.x + shfl_xor(&D_DFT_value.x, 1);
  D_DFT_value.y = parity * D_DFT_value.y + shfl_xor(&D_DFT_value.y, 1);

  //--> Second through Fifth iteration (no synchronization)
  PoT = 2;
  PoTp1 = 4;
  for (q = 1; q < 5; q++) {
    m_param = (local_id & (PoTp1 - 1));
    itemp = m_param >> q;
    parity = ((itemp << 1) - 1);
    if (params::fft_direction)
      W = Get_W_value_inverse((q - 1) * 257 + itemp * m_param);
    else
      W = _gTwiddles[(q - 1) * 257 + itemp * m_param];
    Aftemp.x = W.x * A_DFT_value.x - W.y * A_DFT_value.y;
    Aftemp.y = W.x * A_DFT_value.y + W.y * A_DFT_value.x;
    Bftemp.x = W.x * B_DFT_value.x - W.y * B_DFT_value.y;
    Bftemp.y = W.x * B_DFT_value.y + W.y * B_DFT_value.x;
    Cftemp.x = W.x * C_DFT_value.x - W.y * C_DFT_value.y;
    Cftemp.y = W.x * C_DFT_value.y + W.y * C_DFT_value.x;
    Dftemp.x = W.x * D_DFT_value.x - W.y * D_DFT_value.y;
    Dftemp.y = W.x * D_DFT_value.y + W.y * D_DFT_value.x;

    A_DFT_value.x = Aftemp.x + parity * shfl_xor(&Aftemp.x, PoT);
    A_DFT_value.y = Aftemp.y + parity * shfl_xor(&Aftemp.y, PoT);
    B_DFT_value.x = Bftemp.x + parity * shfl_xor(&Bftemp.x, PoT);
    B_DFT_value.y = Bftemp.y + parity * shfl_xor(&Bftemp.y, PoT);
    C_DFT_value.x = Cftemp.x + parity * shfl_xor(&Cftemp.x, PoT);
    C_DFT_value.y = Cftemp.y + parity * shfl_xor(&Cftemp.y, PoT);
    D_DFT_value.x = Dftemp.x + parity * shfl_xor(&Dftemp.x, PoT);
    D_DFT_value.y = Dftemp.y + parity * shfl_xor(&Dftemp.y, PoT);

    PoT = PoT << 1;
    PoTp1 = PoTp1 << 1;
  }

  itemp = local_id + (warp_id << 2) * params::warp;
  s_input[itemp] = A_DFT_value;
  s_input[itemp + params::warp] = B_DFT_value;
  s_input[itemp + 2 * params::warp] = C_DFT_value;
  s_input[itemp + 3 * params::warp] = D_DFT_value;

  for (q = 5; q < (params::log2_degree - 1); q++) {
    __syncthreads();
    m_param = threadIdx.x & (PoT - 1);
    j = threadIdx.x >> q;

    if (params::fft_direction)
      W = Get_W_value_inverse((q - 1) * 257 + m_param);
    else
      W = _gTwiddles[(q - 1) * 257 + m_param];

    A_read_index = j * (PoTp1 << 1) + m_param;
    B_read_index = j * (PoTp1 << 1) + m_param + PoT;
    C_read_index = j * (PoTp1 << 1) + m_param + PoTp1;
    D_read_index = j * (PoTp1 << 1) + m_param + 3 * PoT;

    Aftemp = s_input[A_read_index];
    Bftemp = s_input[B_read_index];
    A_DFT_value.x = Aftemp.x + W.x * Bftemp.x - W.y * Bftemp.y;
    A_DFT_value.y = Aftemp.y + W.x * Bftemp.y + W.y * Bftemp.x;
    B_DFT_value.x = Aftemp.x - W.x * Bftemp.x + W.y * Bftemp.y;
    B_DFT_value.y = Aftemp.y - W.x * Bftemp.y - W.y * Bftemp.x;

    Cftemp = s_input[C_read_index];
    Dftemp = s_input[D_read_index];
    C_DFT_value.x = Cftemp.x + W.x * Dftemp.x - W.y * Dftemp.y;
    C_DFT_value.y = Cftemp.y + W.x * Dftemp.y + W.y * Dftemp.x;
    D_DFT_value.x = Cftemp.x - W.x * Dftemp.x + W.y * Dftemp.y;
    D_DFT_value.y = Cftemp.y - W.x * Dftemp.y - W.y * Dftemp.x;

    s_input[A_read_index] = A_DFT_value;
    s_input[B_read_index] = B_DFT_value;
    s_input[C_read_index] = C_DFT_value;
    s_input[D_read_index] = D_DFT_value;

    PoT = PoT << 1;
    PoTp1 = PoTp1 << 1;
  }

  // last iteration
  if (params::log2_degree > 6) {
    __syncthreads();
    m_param = threadIdx.x;

    if (params::fft_direction)
      W = Get_W_value_inverse((q - 1) * 257 + m_param);
    else
      W = _gTwiddles[(q - 1) * 257 + m_param];

    A_read_index = m_param;
    B_read_index = m_param + PoT;
    C_read_index = m_param + (PoT >> 1);
    D_read_index = m_param + 3 * (PoT >> 1);

    Aftemp = s_input[A_read_index];
    Bftemp = s_input[B_read_index];
    A_DFT_value.x = Aftemp.x + W.x * Bftemp.x - W.y * Bftemp.y;
    A_DFT_value.y = Aftemp.y + W.x * Bftemp.y + W.y * Bftemp.x;
    B_DFT_value.x = Aftemp.x - W.x * Bftemp.x + W.y * Bftemp.y;
    B_DFT_value.y = Aftemp.y - W.x * Bftemp.y - W.y * Bftemp.x;

    Cftemp = s_input[C_read_index];
    Dftemp = s_input[D_read_index];
    C_DFT_value.x = Cftemp.x + W.y * Dftemp.x + W.x * Dftemp.y;
    C_DFT_value.y = Cftemp.y + W.y * Dftemp.y - W.x * Dftemp.x;
    D_DFT_value.x = Cftemp.x - W.y * Dftemp.x - W.x * Dftemp.y;
    D_DFT_value.y = Cftemp.y - W.y * Dftemp.y + W.x * Dftemp.x;

    s_input[A_read_index] = A_DFT_value;
    s_input[B_read_index] = B_DFT_value;
    s_input[C_read_index] = C_DFT_value;
    s_input[D_read_index] = D_DFT_value;
  }
}

template <class params>
__global__ void SMFFT_DIT_external(double2 *d_input, double2 *d_output) {
  __syncthreads();

  extern __shared__ double2 sharedmemBSK[];

  double2 *s_input = sharedmemBSK;

  int cTid = threadIdx.x * params::opt;
#pragma unroll
  for (int i = 0; i < params::opt; i++) {
    double2 tmp;
    switch (params::degree) {
    case 512:
      tmp = INVERSE_TWIDDLES_512[cTid];
      tmp.x *= params::degree;
      tmp.y *= -params::degree;
      break;
    case 1024:
      tmp = gTwiddles1024[cTid];
      break;
    default:
      break;
    }

    d_input[blockIdx.x * params::degree + cTid] *= tmp;
    cTid++;
  }

  __syncthreads();

  s_input[threadIdx.x] = d_input[threadIdx.x + blockIdx.x * params::degree];
  s_input[threadIdx.x + params::quarter] =
      d_input[threadIdx.x + blockIdx.x * params::degree + params::quarter];
  s_input[threadIdx.x + params::half] =
      d_input[threadIdx.x + blockIdx.x * params::degree + params::half];
  s_input[threadIdx.x + params::three_quarters] =
      d_input[threadIdx.x + blockIdx.x * params::degree +
              params::three_quarters];

  __syncthreads();

  do_SMFFT_CT_DIT<params>(s_input);
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    for (int i = 0; i < 1024; i++)
      printf("smfft[%u] %.10f %.10f\n", i, s_input[i].x, s_input[i].y);
  }
  __syncthreads();

  d_output[threadIdx.x + blockIdx.x * params::degree] = s_input[threadIdx.x];
  d_output[threadIdx.x + blockIdx.x * params::degree + params::quarter] =
      s_input[threadIdx.x + params::quarter];
  d_output[threadIdx.x + blockIdx.x * params::degree + params::half] =
      s_input[threadIdx.x + params::half];
  d_output[threadIdx.x + blockIdx.x * params::degree + params::three_quarters] =
      s_input[threadIdx.x + params::three_quarters];

  __syncthreads();
}

#endif // GPU_BOOTSTRAP_SMFFT_CUH

*/
8204
src/fft/twiddles.cu
Normal file
File diff suppressed because it is too large
28
src/fft/twiddles.cuh
Normal file
@@ -0,0 +1,28 @@
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
#define GPU_BOOTSTRAP_TWIDDLES_CUH

// TODO (Agnes) depending on the device architecture
// can we make more of them __constant__?
// Do we have to define them all regardless of the
// polynomial degree and q values?

// TODO (Beka) make those two arrays dynamically sized
// or find the exact maximum; for a polynomial of length 8192 it should
// be less than 2048
extern __constant__ short SW1[2048];
extern __constant__ short SW2[2048];

extern __constant__ double2 negTwids3[4];
extern __constant__ double2 negTwids4[8];
extern __constant__ double2 negTwids5[16];
extern __constant__ double2 negTwids6[32];
extern __constant__ double2 negTwids7[64];
extern __constant__ double2 negTwids8[128];
extern __constant__ double2 negTwids9[256];
extern __constant__ double2 negTwids10[512];
extern __constant__ double2 negTwids11[1024];
extern __device__ double2 negTwids12[2048];
extern __device__ double2 negTwids13[4096];

#endif
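For reference, an illustrative host-side generator for one such table. The exact values and indexing convention live in the suppressed twiddles.cu, so this is only an assumption: it takes negTwidsK[j] = exp(-i*pi*(2j+1)/2^K), a member of the \zeta family described in bnsmfft.cuh, which at least matches the table sizes (2^(K-1) entries):

#include <cmath>
#include <complex>
#include <vector>

std::vector<std::complex<double>> make_neg_twiddles(int k) {
  const double pi = std::acos(-1.0);
  std::vector<std::complex<double>> table(1 << (k - 1));
  for (size_t j = 0; j < table.size(); j++) {
    double angle = -pi * (2.0 * j + 1.0) / double(1 << k);
    table[j] = {std::cos(angle), std::sin(angle)};
  }
  return table;
}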
55
src/keyswitch.cu
Normal file
@@ -0,0 +1,55 @@
#include "keyswitch.cuh"
#include "keyswitch.h"
#include "polynomial/parameters.cuh"

#include <cstdint>

/* Perform keyswitch on a batch of input LWE ciphertexts for 32 bits
 *
 * - lwe_out: output batch of num_samples keyswitched ciphertexts c =
 *   (a0,..,an-1,b) where n is the LWE dimension
 * - lwe_in: input batch of num_samples LWE ciphertexts, containing n
 *   mask values + 1 body value
 *
 * This function calls a wrapper to a device kernel that performs the
 * keyswitch:
 * - num_samples blocks of threads are launched
 */
void cuda_keyswitch_lwe_ciphertext_vector_32(
    void *v_stream, void *lwe_out, void *lwe_in, void *ksk,
    uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
    uint32_t base_log, uint32_t l_gadget, uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
      v_stream, static_cast<uint32_t *>(lwe_out),
      static_cast<uint32_t *>(lwe_in), static_cast<uint32_t *>(ksk),
      lwe_dimension_before, lwe_dimension_after, base_log, l_gadget,
      num_samples);
}

/* Perform keyswitch on a batch of input LWE ciphertexts for 64 bits
 *
 * - lwe_out: output batch of num_samples keyswitched ciphertexts c =
 *   (a0,..,an-1,b) where n is the LWE dimension
 * - lwe_in: input batch of num_samples LWE ciphertexts, containing n
 *   mask values + 1 body value
 *
 * This function calls a wrapper to a device kernel that performs the
 * keyswitch:
 * - num_samples blocks of threads are launched
 */
void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *v_stream, void *lwe_out, void *lwe_in, void *ksk,
    uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
    uint32_t base_log, uint32_t l_gadget, uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
      v_stream, static_cast<uint64_t *>(lwe_out),
      static_cast<uint64_t *>(lwe_in), static_cast<uint64_t *>(ksk),
      lwe_dimension_before, lwe_dimension_after, base_log, l_gadget,
      num_samples);
}
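Note: to make the calling convention concrete, here is a minimal host-side sketch of invoking the 64-bit entry point. The buffer sizes follow the documented layout (the key-switching key holds lwe_dimension_before * l_gadget blocks of lwe_dimension_after + 1 values, per get_ith_block below), but the parameter values and the way the key material is produced are illustrative assumptions, not part of this commit.

#include <cstdint>
#include <vector>
#include <cuda_runtime.h>
#include "keyswitch.h" // assumed to declare the entry points

int main() {
  // example parameters only
  uint32_t n_before = 1024, n_after = 600;
  uint32_t base_log = 4, l_gadget = 8, num_samples = 100;

  size_t in_len = (size_t)(n_before + 1) * num_samples;
  size_t out_len = (size_t)(n_after + 1) * num_samples;
  size_t ksk_len = (size_t)n_before * l_gadget * (n_after + 1);

  std::vector<uint64_t> h_in(in_len), h_ksk(ksk_len), h_out(out_len);
  // ... fill h_in with ciphertexts and h_ksk with the key-switching key ...

  uint64_t *d_in, *d_out, *d_ksk;
  cudaMalloc(&d_in, in_len * sizeof(uint64_t));
  cudaMalloc(&d_out, out_len * sizeof(uint64_t));
  cudaMalloc(&d_ksk, ksk_len * sizeof(uint64_t));
  cudaMemcpy(d_in, h_in.data(), in_len * sizeof(uint64_t),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_ksk, h_ksk.data(), ksk_len * sizeof(uint64_t),
             cudaMemcpyHostToDevice);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // the API takes the stream by address, as a void *; the wrapper
  // synchronizes the stream before returning
  cuda_keyswitch_lwe_ciphertext_vector_64(&stream, d_out, d_in, d_ksk,
                                          n_before, n_after, base_log,
                                          l_gadget, num_samples);
  cudaMemcpy(h_out.data(), d_out, out_len * sizeof(uint64_t),
             cudaMemcpyDeviceToHost);
  cudaStreamDestroy(stream);
  cudaFree(d_in); cudaFree(d_out); cudaFree(d_ksk);
}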
146
src/keyswitch.cuh
Normal file
@@ -0,0 +1,146 @@
#ifndef CNCRT_KS_H
#define CNCRT_KS_H

#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "polynomial/polynomial.cuh"
#include <thread>
#include <vector>

template <typename Torus>
__device__ Torus *get_ith_block(Torus *ksk, int i, int level,
                                uint32_t lwe_dimension_after,
                                uint32_t l_gadget) {
  int pos = i * l_gadget * (lwe_dimension_after + 1) +
            level * (lwe_dimension_after + 1);
  Torus *ptr = &ksk[pos];
  return ptr;
}

/*
 * keyswitch kernel
 * Each thread handles a piece of the following equation:
 * $$ GLWE_{s_2}(\Delta m + e) = (0,0,\dots,0,b) - \sum_{i=0}^{k-1}
 *    \langle Dec(a_i), (GLWE_{s_2}(s_{1,i}\,q/\beta), \dots,
 *    GLWE_{s_2}(s_{1,i}\,q/\beta^\ell)) \rangle $$
 * where k is the dimension of the GLWE ciphertext, Dec denotes the
 * decomposition with base beta and l levels, and the inner product is taken
 * between the decomposition of a_i and the l GLWE encryptions of
 * s1_i q/beta^j, with j in [1,l]. If the polynomial dimension in GLWE is > 1,
 * this equation is solved for each polynomial coefficient. We obtain a GLWE
 * encryption of Delta.m (with Delta the scaling factor) under key s2 instead
 * of s1, with an increased noise.
 */
template <typename Torus>
__global__ void keyswitch(Torus *lwe_out, Torus *lwe_in, Torus *ksk,
                          uint32_t lwe_dimension_before,
                          uint32_t lwe_dimension_after, uint32_t base_log,
                          uint32_t l_gadget, int lwe_lower, int lwe_upper,
                          int cutoff) {
  int tid = threadIdx.x;

  extern __shared__ char sharedmem[];

  Torus *local_lwe_out = (Torus *)sharedmem;

  auto block_lwe_in = get_chunk(lwe_in, blockIdx.x, lwe_dimension_before + 1);
  auto block_lwe_out = get_chunk(lwe_out, blockIdx.x, lwe_dimension_after + 1);

  auto gadget = GadgetMatrixSingle<Torus>(base_log, l_gadget);

  // the first `cutoff` threads handle one extra output element each
  int lwe_part_per_thd;
  if (tid < cutoff) {
    lwe_part_per_thd = lwe_upper;
  } else {
    lwe_part_per_thd = lwe_lower;
  }
  __syncthreads();

  for (int k = 0; k < lwe_part_per_thd; k++) {
    int idx = tid + k * blockDim.x;
    local_lwe_out[idx] = 0;
  }

  if (tid == 0) {
    local_lwe_out[lwe_dimension_after] = block_lwe_in[lwe_dimension_before];
  }

  for (int i = 0; i < lwe_dimension_before; i++) {

    __syncthreads();

    Torus a_i = round_to_closest_multiple(block_lwe_in[i], base_log, l_gadget);

    for (int j = 0; j < l_gadget; j++) {
      auto ksk_block = get_ith_block(ksk, i, j, lwe_dimension_after, l_gadget);
      Torus decomposed = gadget.decompose_one_level_single(a_i, (uint32_t)j);
      for (int k = 0; k < lwe_part_per_thd; k++) {
        int idx = tid + k * blockDim.x;
        local_lwe_out[idx] -= (Torus)ksk_block[idx] * decomposed;
      }
    }
  }

  for (int k = 0; k < lwe_part_per_thd; k++) {
    int idx = tid + k * blockDim.x;
    block_lwe_out[idx] = local_lwe_out[idx];
  }
}

/// assumes lwe_in is already on the GPU
template <typename Torus>
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
    void *v_stream, Torus *lwe_out, Torus *lwe_in, Torus *ksk,
    uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
    uint32_t base_log, uint32_t l_gadget, uint32_t num_samples) {

  constexpr int ideal_threads = 128;

  int lwe_dim = lwe_dimension_after + 1;
  int lwe_lower, lwe_upper, cutoff;
  if (lwe_dim % ideal_threads == 0) {
    lwe_lower = lwe_dim / ideal_threads;
    lwe_upper = lwe_dim / ideal_threads;
    cutoff = 0;
  } else {
    int y =
        ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
    cutoff = ideal_threads - y;
    lwe_lower = lwe_dim / ideal_threads;
    lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
  }

  // int lwe_size_before = (lwe_dimension_before + 1) * num_samples;
  int lwe_size_after = (lwe_dimension_after + 1) * num_samples;

  int shared_mem = sizeof(Torus) * (lwe_dimension_after + 1);

  cudaMemset(lwe_out, 0, sizeof(Torus) * lwe_size_after);

  dim3 grid(num_samples, 1, 1);
  dim3 threads(ideal_threads, 1, 1);

  cudaFuncSetAttribute(keyswitch<Torus>,
                       cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);

  auto stream = static_cast<cudaStream_t *>(v_stream);
  keyswitch<<<grid, threads, shared_mem, *stream>>>(
      lwe_out, lwe_in, ksk, lwe_dimension_before, lwe_dimension_after, base_log,
      l_gadget, lwe_lower, lwe_upper, cutoff);

  cudaStreamSynchronize(*stream);
}

#endif // CNCRT_KS_H
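Note: the lwe_lower / lwe_upper / cutoff split deserves a worked example. With lwe_dimension_after = 600 (so lwe_dim = 601) and ideal_threads = 128: ceil(601/128) = 5, y = 5*128 - 601 = 39, cutoff = 128 - 39 = 89; the first 89 threads copy 5 elements each and the remaining 39 copy 4, for 89*5 + 39*4 = 601 elements total. A minimal host-side sketch of this invariant (names are illustrative):

#include <cassert>
#include <cmath>

// check that the split covers exactly lwe_dim elements for any block size
// (mirrors the wrapper's arithmetic)
void check_split(int lwe_dim, int ideal_threads) {
  int lwe_lower = lwe_dim / ideal_threads;
  int lwe_upper = (lwe_dim % ideal_threads == 0)
                      ? lwe_lower
                      : (int)std::ceil((double)lwe_dim / ideal_threads);
  int cutoff = (lwe_dim % ideal_threads == 0)
                   ? 0
                   : ideal_threads - (lwe_upper * ideal_threads - lwe_dim);
  assert(cutoff * lwe_upper + (ideal_threads - cutoff) * lwe_lower == lwe_dim);
}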
261
src/polynomial/functions.cuh
Normal file
@@ -0,0 +1,261 @@
#ifndef GPU_POLYNOMIAL_FUNCTIONS
#define GPU_POLYNOMIAL_FUNCTIONS
#include "utils/memory.cuh"
#include "utils/timer.cuh"

/*
 * compresses a decomposed buffer into a half-size complex buffer for the fft
 */
template <typename T, class params>
__device__ void real_to_complex_compressed(T *src, double2 *dst) {
  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    dst[tid].x = (double)src[2 * tid];
    dst[tid].y = (double)src[2 * tid + 1];
    tid += params::degree / params::opt;
  }
}

/*
 * copies a source polynomial into the i-th slice of a batch of polynomials
 * used only in the low-latency version
 */
template <typename T, class params>
__device__ void copy_into_ith_polynomial_low_lat(T *source, T *dst, int i) {
  int tid = threadIdx.x;
  int begin = i * (params::degree / 2 + 1);
#pragma unroll
  for (int k = 0; k < params::opt / 2; k++) {
    dst[tid + begin] = source[tid];
    tid = tid + params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    dst[params::degree / 2 + begin] = source[params::degree / 2];
  }
}

/*
 * accumulates the p_id-th slice of a batched polynomial into the destination
 * used only in the low-latency version
 */
template <typename T, class params>
__device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
  int tid = threadIdx.x;
  int begin = p_id * (params::degree / 2 + 1);
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    dst[tid] += source[tid + begin];
    tid = tid + params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    dst[params::degree / 2] += source[params::degree / 2 + begin];
  }
}

/*
 * Performs acc = input / X^j (i.e. input * X^{-j} in the negacyclic ring)
 * if zeroAcc is false; performs acc = 0 if zeroAcc is true.
 * takes a single buffer and calculates in place.
 */
template <typename T, int elems_per_thread, int block_size>
__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
                                                      uint32_t j,
                                                      bool zeroAcc) {
  int tid = threadIdx.x;
  constexpr int degree = block_size * elems_per_thread;
  if (zeroAcc) {
    for (int i = 0; i < elems_per_thread; i++) {
      accumulator[tid] = 0;
      tid += block_size;
    }
  } else {
    tid = threadIdx.x;
    for (int i = 0; i < elems_per_thread; i++) {
      if (j < degree) {
        if (tid < degree - j) {
          accumulator[tid] = input[tid + j];
        } else {
          accumulator[tid] = -input[tid - degree + j];
        }
      } else {
        uint32_t jj = j - degree;
        if (tid < degree - jj) {
          accumulator[tid] = -input[tid + jj];
        } else {
          accumulator[tid] = input[tid - degree + jj];
        }
      }
      tid += block_size;
    }
  }
}

/*
 * Performs result_acc = acc * X^j - acc = acc * (X^j - 1)
 * takes a single buffer as input and returns a single rotated buffer
 */
template <typename T, int elems_per_thread, int block_size>
__device__ void
multiply_by_monomial_negacyclic_and_sub_polynomial(T *acc, T *result_acc,
                                                   uint32_t j) {
  int tid = threadIdx.x;
  constexpr int degree = block_size * elems_per_thread;
  for (int i = 0; i < elems_per_thread; i++) {
    if (j < degree) {
      if (tid < j) {
        result_acc[tid] = -acc[tid - j + degree] - acc[tid];
      } else {
        result_acc[tid] = acc[tid - j] - acc[tid];
      }
    } else {
      uint32_t jj = j - degree;
      if (tid < jj) {
        result_acc[tid] = acc[tid - jj + degree] - acc[tid];
      } else {
        result_acc[tid] = -acc[tid - jj] - acc[tid];
      }
    }
    tid += block_size;
  }
}

/*
 * performs a rounding step that increases the accuracy of the PBS
 * calculates in place.
 */
template <typename T, int elems_per_thread, int block_size>
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
                                                  int l_gadget) {
  int tid = threadIdx.x;
  for (int i = 0; i < elems_per_thread; i++) {
    T x_acc = rotated_acc[tid];
    T shift = sizeof(T) * 8 - l_gadget * base_log;
    T mask = 1ll << (shift - 1);
    T b_acc = (x_acc & mask) >> (shift - 1);
    T res_acc = x_acc >> shift;
    res_acc += b_acc;
    res_acc <<= shift;
    rotated_acc[tid] = res_acc;
    tid = tid + block_size;
  }
}

template <typename Torus, class params>
__device__ void add_to_torus(double2 *m_values, Torus *result) {
  Torus mx = (sizeof(Torus) == 4) ? UINT32_MAX : UINT64_MAX;
  int tid = threadIdx.x;
  // TODO (Beka) check if better memory access is possible
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    double v1 = m_values[tid].x;
    double v2 = m_values[tid].y;

    double frac = v1 - floor(v1);
    frac *= mx;
    double carry = frac - floor(frac);
    double zero_point_five = 1. / 2.;
    if (carry >= zero_point_five)
      frac++;

    Torus V1 = typecast_double_to_torus<Torus>(frac);

    frac = v2 - floor(v2);
    frac *= mx;
    carry = frac - floor(frac);
    if (carry >= zero_point_five)
      frac++;

    Torus V2 = typecast_double_to_torus<Torus>(frac);

    result[tid * 2] += V1;
    result[tid * 2 + 1] += V2;
    tid = tid + params::degree / params::opt;
  }
}

template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_out, Torus *accumulator) {
  // Set the first coefficient of the accumulator as the body of the LWE sample
  // todo(Joao): not every thread needs to set it
  // if (threadIdx.x == 0)
  lwe_out[params::degree] = accumulator[0];
}

template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_out, Torus *accumulator) {
  // Set ACC = -ACC
  // accumulator.negate_inplace();
  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt; i++) {
    accumulator[tid] = -accumulator[tid];
    tid = tid + params::degree / params::opt;
  }
  synchronize_threads_in_block();

  // Reverse the accumulator
  // accumulator.reverse_inplace();
  tid = threadIdx.x;
  Torus result[params::opt];
#pragma unroll
  for (int i = 0; i < params::opt; i++) {
    result[i] = accumulator[params::degree - tid - 1];
    tid = tid + params::degree / params::opt;
  }
  synchronize_threads_in_block();
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt; i++) {
    accumulator[tid] = result[i];
    tid = tid + params::degree / params::opt;
  }
  synchronize_threads_in_block();

  // Perform ACC * X (negacyclic multiplication by the monomial X)
  // accumulator.multiply_by_monomial_negacyclic_inplace(1);
  tid = threadIdx.x;
  for (int i = 0; i < params::opt; i++) {
    if (1 < params::degree) {
      if (tid < 1)
        result[i] = -accumulator[tid - 1 + params::degree];
      else
        result[i] = accumulator[tid - 1];
    } else {
      uint32_t jj = 1 - (uint32_t)params::degree;
      if (tid < jj)
        result[i] = accumulator[tid - jj + params::degree];
      else
        result[i] = -accumulator[tid - jj];
    }
    tid += params::degree / params::opt;
  }
  synchronize_threads_in_block();
  tid = threadIdx.x;
  for (int i = 0; i < params::opt; i++) {
    accumulator[tid] = result[i];
    tid += params::degree / params::opt;
  }
  synchronize_threads_in_block();

  // Copy to the mask of the LWE sample
  // accumulator.copy_into(lwe_out);
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt; i++) {
    lwe_out[tid] = accumulator[tid];
    tid = tid + params::degree / params::opt;
  }
}

#endif // GPU_POLYNOMIAL_FUNCTIONS
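Note: a quick sanity check of the rounding trick above, as a host-side scalar sketch (the function name and test values are illustrative only). Dropping shift = 8*sizeof(T) - l_gadget*base_log low bits while adding back the highest dropped bit rounds to the nearest multiple of 2^shift:

#include <cassert>
#include <cstdint>

// scalar mirror of the in-place device rounding (uint32_t case)
static uint32_t round_scalar(uint32_t x, uint32_t base_log, uint32_t l_gadget) {
  uint32_t shift = 32 - l_gadget * base_log;
  uint32_t mask = 1u << (shift - 1);
  uint32_t b = (x & mask) >> (shift - 1); // highest dropped bit
  return ((x >> shift) + b) << shift;     // nearest multiple of 2^shift
}

int main() {
  // base_log = 4, l_gadget = 4 => shift = 16
  assert(round_scalar(0x1234ABCDu, 4, 4) == 0x12350000u); // rounds up
  assert(round_scalar(0x12341BCDu, 4, 4) == 0x12340000u); // rounds down
}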
78
src/polynomial/parameters.cuh
Normal file
@@ -0,0 +1,78 @@
#ifndef CNCRT_PARAMETERS_H
#define CNCRT_PARAMETERS_H

constexpr int log2(int n) { return (n <= 2) ? 1 : 1 + log2(n / 2); }

constexpr int choose_opt(int degree) {
  if (degree <= 1024)
    return 4;
  else if (degree == 2048)
    return 8;
  else if (degree == 4096)
    return 16;
  else
    return 32;
}

template <class params> class HalfDegree {
public:
  constexpr static int degree = params::degree / 2;
  constexpr static int opt = params::opt / 2;
  constexpr static int log2_degree = params::log2_degree - 1;
  constexpr static int quarter = params::quarter / 2;
  constexpr static int half = params::half / 2;
  constexpr static int three_quarters = quarter + half;
  constexpr static int warp = 32;
  constexpr static int fft_sm_required = degree + degree / warp;
};

template <int N> class Degree {
public:
  constexpr static int degree = N;
  constexpr static int opt = choose_opt(N);
  constexpr static int log2_degree = log2(N);
  constexpr static int quarter = N / 4;
  constexpr static int half = N / 2;
  constexpr static int three_quarters = half + quarter;
  constexpr static int warp = 32;
  constexpr static int fft_sm_required = N + 32;
};

enum sharedMemDegree { NOSM = 0, PARTIALSM = 1, FULLSM = 2 };

class ForwardFFT {
public:
  constexpr static int direction = 0;
};

class BackwardFFT {
public:
  constexpr static int direction = 1;
};

// members must be public so FFTDegree/FFTParams can read reorder::reorder
class ReorderFFT {
public:
  constexpr static int reorder = 1;
};
class NoReorderFFT {
public:
  constexpr static int reorder = 0;
};

template <class params, class direction, class reorder = ReorderFFT>
class FFTDegree : public params {
public:
  constexpr static int fft_direction = direction::direction;
  constexpr static int fft_reorder = reorder::reorder;
};

template <int N, class direction, class reorder = ReorderFFT>
class FFTParams : public Degree<N> {
public:
  constexpr static int fft_direction = direction::direction;
  constexpr static int fft_reorder = reorder::reorder;
};

#endif // CNCRT_PARAMETERS_H
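Note: as a usage sketch (the instantiation below is illustrative, not taken from this commit), the parameter classes compose like this:

#include "polynomial/parameters.cuh"

// Degree<1024>: degree = 1024, opt = 4, log2_degree = 10,
// quarter = 256, half = 512, three_quarters = 768
using P = FFTParams<1024, ForwardFFT>; // reorder defaults to ReorderFFT
static_assert(P::degree == 1024 && P::opt == 4, "");
static_assert(P::fft_direction == 0 && P::fft_reorder == 1, "");

// HalfDegree<P> gives the 512-point view used for the compressed FFT
static_assert(HalfDegree<P>::degree == 512, "");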
668
src/polynomial/polynomial.cuh
Normal file
@@ -0,0 +1,668 @@
#ifndef CNCRT_POLYNOMIAL_H
#define CNCRT_POLYNOMIAL_H

#include "complex/operations.cuh"
#include "crypto/torus.cuh"
#include "fft/bnsmfft.cuh"
#include "fft/smfft.cuh"
#include "parameters.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"
#include <cassert>
#include <cstdint>

#define PI 3.141592653589793238462643383279502884197

template <typename T>
__device__ T *get_chunk(T *data, int chunk_num, int chunk_size) {
  int pos = chunk_num * chunk_size;
  T *ptr = &data[pos];
  return ptr;
}

class ExtraMemory {
public:
  uint32_t m_size;
  __device__ ExtraMemory(uint32_t size) : m_size(size) {}
};

template <typename T, class params> class PolynomialFourier;

template <typename T, class params> class Polynomial;

template <typename T, class params> class Vector;

template <typename FT, class params> class Twiddles;
template <typename T, class params> class VectorPolynomial {
public:
  T *m_data;
  uint32_t m_num_polynomials;

  __device__ VectorPolynomial(T *data, uint32_t num_polynomials)
      : m_data(data), m_num_polynomials(num_polynomials) {}

  __device__ VectorPolynomial(SharedMemory &shmem, uint32_t num_polynomials)
      : m_num_polynomials(num_polynomials) {
    shmem.get_allocation(&m_data, m_num_polynomials * params::degree);
  }

  __device__ VectorPolynomial<T, params> get_chunk(int chunk_num,
                                                   int chunk_size) {
    int pos = chunk_num * chunk_size;
    T *ptr = &m_data[pos];
    // todo(Joao): unsafe, the user must pass a chunk whose size is a
    // multiple of the polynomial degree
    return VectorPolynomial<T, params>(ptr, chunk_size / params::degree);
  }

  __host__ VectorPolynomial() {}

  __host__ VectorPolynomial(DeviceMemory &dmem, uint32_t num_polynomials,
                            int device)
      : m_num_polynomials(num_polynomials) {
    dmem.get_allocation(&m_data, m_num_polynomials * params::degree, device);
  }

  __host__ VectorPolynomial(DeviceMemory &dmem, T *source,
                            uint32_t num_polynomials, int device)
      : m_num_polynomials(num_polynomials) {
    dmem.get_allocation_and_copy_async(
        &m_data, source, m_num_polynomials * params::degree, device);
  }

  __host__ void copy_to_host(T *dest) {
    cudaMemcpyAsync(dest, m_data,
                    sizeof(T) * m_num_polynomials * params::degree,
                    cudaMemcpyDeviceToHost);
  }

  __device__ void copy_into(Polynomial<T, params> &dest,
                            int polynomial_number = 0) {
    int tid = threadIdx.x;
    int begin = polynomial_number * params::degree;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      dest.coefficients[tid] = m_data[tid + begin];
      tid = tid + params::degree / params::opt;
    }
    synchronize_threads_in_block();
  }

  // todo(Joao): we need to make these APIs clearer, as it's confusing
  // what's being copied where
  __device__ void copy_into_ith_polynomial(PolynomialFourier<T, params> &source,
                                           int i) {
    int tid = threadIdx.x;
    int begin = i * (params::degree / 2 + 1);
#pragma unroll
    for (int k = 0; k < params::opt / 2; k++) {
      this->m_data[tid + begin] = source.m_values[tid];
      tid = tid + params::degree / params::opt;
    }

    if (threadIdx.x == 0) {
      this->m_data[params::degree / 2 + begin] =
          source.m_values[params::degree / 2];
    }
  }

  __device__ void split_into_polynomials(Polynomial<T, params> &first,
                                         Polynomial<T, params> &second) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      first.coefficients[tid] = m_data[tid];
      second.coefficients[tid] = m_data[tid + params::degree];
      tid = tid + params::degree / params::opt;
    }
  }
};
template <typename T, class params> class PolynomialFourier {
public:
  T *m_values;
  uint32_t degree;

  __device__ __host__ PolynomialFourier(T *m_values)
      : m_values(m_values), degree(params::degree) {}

  __device__ PolynomialFourier(SharedMemory &shmem) : degree(params::degree) {
    shmem.get_allocation(&this->m_values, params::degree);
  }

  __device__ PolynomialFourier(SharedMemory &shmem, ExtraMemory extra_memory)
      : degree(params::degree) {
    shmem.get_allocation(&this->m_values, params::degree + extra_memory.m_size);
  }
  __device__ PolynomialFourier(SharedMemory &shmem, uint32_t degree)
      : degree(degree) {
    shmem.get_allocation(&this->m_values, degree);
  }

  __host__ PolynomialFourier(DeviceMemory &dmem, int device)
      : degree(params::degree) {
    dmem.get_allocation(&this->m_values, params::degree, device);
  }

  __device__ char *reuse_memory() { return (char *)m_values; }
  __device__ void copy_from(PolynomialFourier<T, params> &source, int begin) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      this->m_values[tid + begin] = source.m_values[tid];
      tid = tid + params::degree / params::opt;
    }
  }
  __device__ void fill_with(T value) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      m_values[tid] = value;
      tid += params::degree / params::opt;
    }
  }

  /*
  __device__ void add_polynomial_inplace(PolynomialFourier<T, params> &source,
                                         int begin) {
    int tid = threadIdx.x;
  #pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      this->m_values[tid] += source.m_values[tid + begin];
      tid = tid + params::degree / params::opt;
    }
    if (threadIdx.x == 0) {
      this->m_values[params::degree / 2] +=
          source.m_values[params::degree / 2 + begin];
    }
  }
  */

  __device__ void swap_quarters_inplace() {
    int tid = threadIdx.x;
    int s1 = params::quarter;
    int s2 = params::three_quarters;

    T tmp = m_values[s2 + tid];
    m_values[s2 + tid] = m_values[s1 + tid];
    m_values[s1 + tid] = tmp;
  }

  __device__ void add_polynomial_inplace(VectorPolynomial<T, params> &source,
                                         int polynomial_number) {
    int tid = threadIdx.x;
    int begin = polynomial_number * (params::degree / 2 + 1);
#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      this->m_values[tid] += source.m_data[tid + begin];
      tid = tid + params::degree / params::opt;
    }

    if (threadIdx.x == 0) {
      this->m_values[params::degree / 2] +=
          source.m_data[params::degree / 2 + begin];
    }
  }

  __device__ void
  forward_negacyclic_fft_inplace(PolynomialFourier<double2, params> &X) {
    // TODO this function should be removed
  }

  __device__ void inverse_negacyclic_fft_inplace() {
    // TODO this function should be removed
  }

  template <typename Torus>
  __device__ void add_to_torus(Polynomial<Torus, params> &result) {
    // TODO this function should be removed
  }

  __device__ T &operator[](int i) { return m_values[i]; }
};
template <typename T, class params> class Polynomial {
public:
  T *coefficients;
  uint32_t degree;

  __device__ Polynomial(T *coefficients, uint32_t degree)
      : coefficients(coefficients), degree(degree) {}

  __device__ Polynomial(char *memory, uint32_t degree)
      : coefficients((T *)memory), degree(degree) {}

  __device__ Polynomial(SharedMemory &shmem, uint32_t degree) : degree(degree) {
    shmem.get_allocation(&this->coefficients, degree);
  }

  __host__ Polynomial(DeviceMemory &dmem, uint32_t degree, int device)
      : degree(degree) {
    dmem.get_allocation(&this->coefficients, params::degree, device);
  }

  __host__ Polynomial(DeviceMemory &dmem, T *source, uint32_t degree,
                      int device)
      : degree(degree) {
    dmem.get_allocation_and_copy_async(&this->coefficients, source,
                                       params::degree, device);
  }

  __host__ void copy_to_host(T *dest) {
    cudaMemcpyAsync(dest, this->coefficients, sizeof(T) * params::degree,
                    cudaMemcpyDeviceToHost);
  }

  __device__ T get_coefficient(int i) { return this->coefficients[i]; }

  __device__ char *reuse_memory() { return (char *)coefficients; }

  __device__ void copy_coefficients_from(Polynomial<T, params> &source,
                                         int begin_dest = 0,
                                         int begin_src = 0) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      this->coefficients[tid + begin_dest] =
          source.coefficients[tid + begin_src];
      tid = tid + params::degree / params::opt;
    }
  }

  __device__ void fill_with(T value) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      coefficients[tid] = value;
      tid += params::degree / params::opt;
    }
  }

  __device__ void multiply_by_monomial_negacyclic(Polynomial<T, params> &result,
                                                  uint32_t j) {
    int tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      if (j < params::degree) {
        if (tid < j)
          result.coefficients[tid] =
              -this->coefficients[tid - j + params::degree];
        else
          result.coefficients[tid] = this->coefficients[tid - j];
      } else {
        uint32_t jj = j - params::degree;
        if (tid < jj)
          result.coefficients[tid] =
              this->coefficients[tid - jj + params::degree];
        else
          result.coefficients[tid] = -this->coefficients[tid - jj];
      }
      tid += params::degree / params::opt;
    }
  }

  __device__ void multiply_by_monomial_negacyclic_inplace(uint32_t j) {
    int tid = threadIdx.x;
    T result[params::opt];
    for (int i = 0; i < params::opt; i++) {
      if (j < params::degree) {
        if (tid < j)
          result[i] = -this->coefficients[tid - j + params::degree];
        else
          result[i] = this->coefficients[tid - j];
      } else {
        uint32_t jj = j - params::degree;
        if (tid < jj)
          result[i] = this->coefficients[tid - jj + params::degree];
        else
          result[i] = -this->coefficients[tid - jj];
      }
      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();
    tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      coefficients[tid] = result[i];
      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();
  }

  __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
      Polynomial<T, params> &result, uint32_t j) {
    int tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      if (j < params::degree) {
        if (tid < j)
          result.coefficients[tid] =
              -this->coefficients[tid - j + params::degree] -
              this->coefficients[tid];
        else
          result.coefficients[tid] =
              this->coefficients[tid - j] - this->coefficients[tid];
      } else {
        uint32_t jj = j - params::degree;
        if (tid < jj)
          result.coefficients[tid] =
              this->coefficients[tid - jj + params::degree] -
              this->coefficients[tid];
        else
          result.coefficients[tid] =
              -this->coefficients[tid - jj] - this->coefficients[tid];
      }
      tid += params::degree / params::opt;
    }
  }

  __device__ void divide_by_monomial_negacyclic(Polynomial<T, params> &result,
                                                uint32_t j) {
    int tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      if (j < params::degree) {
        if (tid < params::degree - j) {
          result.coefficients[tid] = this->coefficients[tid + j];
        } else {
          result.coefficients[tid] =
              -this->coefficients[tid - params::degree + j];
        }
      } else {
        uint32_t jj = j - params::degree;
        if (tid < params::degree - jj) {
          result.coefficients[tid] = -this->coefficients[tid + jj];
        } else {
          result.coefficients[tid] =
              this->coefficients[tid - params::degree + jj];
        }
      }
      tid += params::degree / params::opt;
    }
  }

  __device__ void divide_by_monomial_negacyclic_inplace(uint32_t j) {
    int tid = threadIdx.x;
    T result[params::opt];
    for (int i = 0; i < params::opt; i++) {
      if (j < params::degree) {
        if (tid < params::degree - j) {
          result[i] = this->coefficients[tid + j];
        } else {
          result[i] = -this->coefficients[tid - params::degree + j];
        }
      } else {
        uint32_t jj = j - params::degree;
        if (tid < params::degree - jj) {
          result[i] = -this->coefficients[tid + jj];
        } else {
          result[i] = this->coefficients[tid - params::degree + jj];
        }
      }
      tid += params::degree / params::opt;
    }
    tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      coefficients[tid] = result[i];
      tid = tid + params::degree / params::opt;
    }
  }

  __device__ void round_to_closest_multiple_inplace(uint32_t base_log,
                                                    uint32_t l_gadget) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      T x = coefficients[tid];
      T shift = sizeof(T) * 8 - l_gadget * base_log;
      T mask = 1ll << (shift - 1);
      T b = (x & mask) >> (shift - 1);
      T res = x >> shift;
      res += b;
      res <<= shift;
      coefficients[tid] = res;
      tid = tid + params::degree / params::opt;
    }
  }

  __device__ void
  to_complex_compressed(PolynomialFourier<double2, params> &dest) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      dest.m_values[tid].x = (double)coefficients[2 * tid];
      dest.m_values[tid].y = (double)coefficients[2 * tid + 1];
      tid += params::degree / params::opt;
    }
  }

  __device__ void to_complex(PolynomialFourier<double2, params> &dest) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      dest.m_values[tid].x = (double)coefficients[tid];
      dest.m_values[tid].y = 0.0;
      tid += params::degree / params::opt;
    }
  }

  __device__ void multiply_by_scalar_inplace(T scalar) {
    int tid = threadIdx.x;
    const int grid_dim = blockDim.x;
    const int slices = params::degree / grid_dim;
    const int jump = grid_dim;
    for (int i = 0; i < slices; i++) {
      this->coefficients[tid] *= scalar;
      tid += jump;
    }
  }

  __device__ void add_scalar_inplace(T scalar) {
    int tid = threadIdx.x;
    const int grid_dim = blockDim.x;
    const int slices = params::degree / grid_dim;
    const int jump = grid_dim;
    for (int i = 0; i < slices; i++) {
      this->coefficients[tid] += scalar;
      tid += jump;
    }
  }

  __device__ void sub_scalar_inplace(T scalar) {
    int tid = threadIdx.x;
    const int grid_dim = blockDim.x;
    const int slices = params::degree / grid_dim;
    const int jump = grid_dim;
    for (int i = 0; i < slices; i++) {
      this->coefficients[tid] -= scalar;
      tid += jump;
    }
  }

  /*
  __device__ void add_polynomial_inplace(Polynomial<T, params> &source,
                                         int begin) {
    int tid = threadIdx.x;
  #pragma unroll
    for (int i = 0; i < params::opt; i++) {
      this->coefficients[tid] += source.coefficients[tid + begin];
      tid = tid + params::degree / params::opt;
    }
  }
  */

  __device__ void sub_polynomial_inplace(Polynomial<T, params> &rhs) {
    int tid = threadIdx.x;
    const int grid_dim = blockDim.x;
    const int slices = params::degree / grid_dim;
    const int jump = grid_dim;
    for (int i = 0; i < slices; i++) {
      this->coefficients[tid] -= rhs.coefficients[tid];
      tid += jump;
    }
  }

  __device__ void negate_inplace() {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      coefficients[tid] = -coefficients[tid];
      tid = tid + params::degree / params::opt;
    }
    synchronize_threads_in_block();
  }

  __device__ void copy_into(Vector<T, params> &vec) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      vec.m_data[tid] = coefficients[tid];
      tid = tid + params::degree / params::opt;
    }
  }

  __device__ void copy_reversed_into(Vector<T, params> &vec) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      vec.m_data[tid] = coefficients[params::degree - tid - 1];
      tid = tid + params::degree / params::opt;
    }
  }

  __device__ void reverse_inplace() {
    int tid = threadIdx.x;
    T result[params::opt];
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      result[i] = coefficients[params::degree - tid - 1];
      tid = tid + params::degree / params::opt;
    }
    synchronize_threads_in_block();
    tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      coefficients[tid] = result[i];
      tid = tid + params::degree / params::opt;
    }
    synchronize_threads_in_block();
  }

  template <typename FT>
  __device__ void
  forward_negacyclic_fft_half(PolynomialFourier<FT, params> &result) {
    // TODO this function should be removed
  }
};
template <typename T, class params> class Vector {
public:
  T *m_data;
  uint32_t m_size;

  __device__ Vector(T *elements, uint32_t size)
      : m_data(elements), m_size(size) {}

  template <typename V>
  __device__ Vector(SharedMemory &shmem, V src, int size) : m_size(size) {
    shmem.get_allocation(&m_data, m_size);
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt && tid < m_size; i++) {
      m_data[tid] = src[tid];
      tid += params::degree / params::opt;
    }
  }

  __device__ Vector(SharedMemory &shmem, uint32_t size) : m_size(size) {
    shmem.get_allocation(&m_data, m_size);
  }

  __host__ Vector() {}

  __host__ Vector(DeviceMemory &dmem, uint32_t size, int device)
      : m_size(size) {
    dmem.get_allocation(&m_data, m_size, device);
  }

  __device__ T &operator[](int i) { return m_data[i]; }

  __device__ Vector<T, params> get_chunk(int chunk_num, int chunk_size) {
    int pos = chunk_num * chunk_size;
    T *ptr = &m_data[pos];
    return Vector<T, params>(ptr, chunk_size);
  }

  __host__ void copy_to_device(T *source, uint32_t elements) {
    cudaMemcpyAsync(m_data, source, sizeof(T) * elements,
                    cudaMemcpyHostToDevice);
  }

  __host__ Vector(DeviceMemory &dmem, T *source, uint32_t size_source,
                  int device)
      : m_size(size_source) {
    dmem.get_allocation_and_copy_async(&m_data, source, m_size, device);
  }

  __host__ Vector(DeviceMemory &dmem, T *source, uint32_t allocation_size,
                  uint32_t copy_size, int device)
      : m_size(allocation_size) {
    if (copy_size > allocation_size) {
      printf("warning: copying more than the allocation\n");
    }
    dmem.get_allocation_and_copy_async(&m_data, source, m_size, copy_size,
                                       device);
  }

  __host__ void copy_to_host(T *dest) {
    cudaMemcpyAsync(dest, m_data, sizeof(T) * m_size, cudaMemcpyDeviceToHost);
  }

  __host__ void copy_to_host(T *dest, int elements) {
    cudaMemcpyAsync(dest, m_data, sizeof(T) * elements, cudaMemcpyDeviceToHost);
  }

  __device__ T get_ith_element(int i) { return m_data[i]; }

  __device__ T get_last_element() { return m_data[m_size - 1]; }

  __device__ void set_last_element(T elem) { m_data[m_size - 1] = elem; }

  // todo(Joao): let's do coalesced access here at some point
  __device__ void operator-=(const Vector<T, params> &rhs) {
    assert(m_size == rhs.m_size);
    int pos = threadIdx.x;
    int total = m_size / blockDim.x + 1;
    for (int i = 0; i < total; i++) {
      if (pos < m_size)
        m_data[pos] -= rhs.m_data[pos];
      pos += blockDim.x;
    }
  }

  __device__ void operator*=(const T &rhs) {
    int pos = threadIdx.x;
    int total = m_size / blockDim.x + 1;
    for (int i = 0; i < total; i++) {
      if (pos < m_size)
        m_data[pos] *= rhs;
      pos += blockDim.x;
    }
  }
};
template <typename FT, class params> class Twiddles {
public:
  Vector<FT, params> twiddles2, twiddles3, twiddles4, twiddles5, twiddles6,
      twiddles7, twiddles8, twiddles9, twiddles10;

  __device__
  Twiddles(Vector<FT, params> &twiddles2, Vector<FT, params> &twiddles3,
           Vector<FT, params> &twiddles4, Vector<FT, params> &twiddles5,
           Vector<FT, params> &twiddles6, Vector<FT, params> &twiddles7,
           Vector<FT, params> &twiddles8, Vector<FT, params> &twiddles9,
           Vector<FT, params> &twiddles10)
      : twiddles2(twiddles2), twiddles3(twiddles3), twiddles4(twiddles4),
        twiddles5(twiddles5), twiddles6(twiddles6), twiddles7(twiddles7),
        twiddles8(twiddles8), twiddles9(twiddles9), twiddles10(twiddles10) {}
};

#endif // CNCRT_POLYNOMIAL_H
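Note: for intuition on the monomial routines above, in the negacyclic ring $R = \mathbb{Z}[X]/(X^N+1)$ the relation $X^N = -1$ holds, so for $0 \le j < N$:

$$ X^j \cdot \sum_{k=0}^{N-1} p_k X^k \;=\; \sum_{k=j}^{N-1} p_{k-j} X^{k} \;-\; \sum_{k=0}^{j-1} p_{k-j+N} X^{k}, $$

which is exactly the two branches (plain copy vs. negated wrap-around) implemented by multiply_by_monomial_negacyclic; divide_by_monomial_negacyclic applies the same identity with $X^{-j} = -X^{N-j}$.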
65
src/polynomial/polynomial_math.cuh
Normal file
@@ -0,0 +1,65 @@
#ifndef CNCRT_POLYNOMIAL_MATH_H
#define CNCRT_POLYNOMIAL_MATH_H

#include "crypto/torus.cuh"
#include "parameters.cuh"
#include "polynomial.cuh"

template <class params, typename FT>
__device__ void polynomial_product_in_fourier_domain(FT *result, FT *first,
                                                     FT *second) {
  int tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    result[tid] = first[tid] * second[tid];
    tid += params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    result[params::degree / 2] =
        first[params::degree / 2] * second[params::degree / 2];
  }
}

template <class params, typename FT>
__device__ void polynomial_product_in_fourier_domain(
    PolynomialFourier<FT, params> &result, PolynomialFourier<FT, params> &first,
    PolynomialFourier<FT, params> &second) {
  int tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    result[tid] = first[tid] * second[tid];
    tid += params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    result[params::degree / 2] =
        first[params::degree / 2] * second[params::degree / 2];
  }
}

template <class params, typename FT>
__device__ void polynomial_product_accumulate_in_fourier_domain(
    PolynomialFourier<FT, params> &result, PolynomialFourier<FT, params> &first,
    PolynomialFourier<FT, params> &second) {
  int tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    result[tid] += first[tid] * second[tid];
    tid += params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    result[params::degree / 2] +=
        first[params::degree / 2] * second[params::degree / 2];
  }
}

template <class params, typename FT>
__device__ void polynomial_product_accumulate_in_fourier_domain(
    FT *result, FT *first, PolynomialFourier<FT, params> &second) {
  int tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    result[tid] += first[tid] * second.m_values[tid];
    tid += params::degree / params::opt;
  }
}

#endif // CNCRT_POLYNOMIAL_MATH_H
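Note: why pointwise products suffice here: under the negacyclic FFT, multiplication in $R = \mathbb{Z}[X]/(X^N+1)$ becomes coordinate-wise multiplication of the transformed vectors, $\widehat{pq}_k = \hat{p}_k\,\hat{q}_k$, so a polynomial product costs one transform per operand, $N/2 + 1$ complex multiplies (thanks to the compressed representation), and one inverse transform.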
1
src/polynomial/polynomial_twiddles.cuh
Normal file
@@ -0,0 +1 @@
#include "polynomial.cuh"
76
src/types/int128.cuh
Normal file
@@ -0,0 +1,76 @@
#ifndef CNCRT_INT128_H
#define CNCRT_INT128_H

// abseil's int128 type
// licensed under the Apache license

class uint128 {
public:
  __device__ uint128(uint64_t high, uint64_t low) : hi_(high), lo_(low) {}

  uint64_t lo_;
  uint64_t hi_;
};

class int128 {
public:
  int128() = default;

  __device__ operator unsigned long long() const {
    return static_cast<unsigned long long>(lo_);
  }

  __device__ int128(int64_t high, uint64_t low) : hi_(high), lo_(low) {}

  uint64_t lo_;
  int64_t hi_;
};

__device__ inline uint128 make_uint128(uint64_t high, uint64_t low) {
  return uint128(high, low);
}

template <typename T> __device__ uint128 make_uint128_from_float(T v) {
  if (v >= ldexp(static_cast<T>(1), 64)) {
    uint64_t hi = static_cast<uint64_t>(ldexp(v, -64));
    uint64_t lo = static_cast<uint64_t>(v - ldexp(static_cast<T>(hi), 64));
    return make_uint128(hi, lo);
  }

  return make_uint128(0, static_cast<uint64_t>(v));
}

__device__ inline int128 make_int128(int64_t high, uint64_t low) {
  return int128(high, low);
}

__device__ inline int64_t bitcast_to_signed(uint64_t v) {
  return v & (uint64_t{1} << 63) ? ~static_cast<int64_t>(~v)
                                 : static_cast<int64_t>(v);
}

__device__ inline uint64_t uint128_high64(uint128 v) { return v.hi_; }
__device__ inline uint64_t uint128_low64(uint128 v) { return v.lo_; }

__device__ __forceinline__ uint128 operator-(uint128 val) {
  uint64_t hi = ~uint128_high64(val);
  uint64_t lo = ~uint128_low64(val) + 1;
  if (lo == 0)
    ++hi; // carry
  return make_uint128(hi, lo);
}

template <typename T> __device__ int128 make_int128_from_float(T v) {
  // We must convert the absolute value and then negate as needed, because
  // floating point types are typically sign-magnitude. Otherwise, the
  // difference between the high and low 64 bits when interpreted as two's
  // complement overwhelms the precision of the mantissa.
  uint128 result =
      v < 0 ? -make_uint128_from_float(-v) : make_uint128_from_float(v);

  return make_int128(bitcast_to_signed(uint128_high64(result)),
                     uint128_low64(result));
}

#endif // CNCRT_INT128_H
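Note: a small illustrative device snippet (not part of the commit; kernel and buffer names are hypothetical). Because conversion truncates the magnitude first, -3.5 maps to -3, whose low limb in two's complement is 0xFFFFFFFFFFFFFFFD:

__global__ void int128_demo(unsigned long long *out) {
  int128 x = make_int128_from_float(-3.5); // magnitude truncates: -3
  out[0] = static_cast<unsigned long long>(x); // low 64 bits: 0xFF..FD
}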
15
src/utils/kernel_dimensions.cuh
Normal file
@@ -0,0 +1,15 @@
// round x up to the next power of two
int nextPow2(int x) {
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return ++x;
}

// pick a power-of-two block size (capped at maxBlockSize) and the number of
// blocks needed to cover n elements
void getNumBlocksAndThreads(const int n, const int maxBlockSize, int &blocks,
                            int &threads) {
  threads = (n < maxBlockSize * 2) ? nextPow2((n + 1) / 2) : maxBlockSize;
  blocks = (n + threads - 1) / threads;
}
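Note: a worked example of the sizing logic (values are illustrative), assuming this header is included:

#include <cassert>
#include "utils/kernel_dimensions.cuh"

int main() {
  int blocks, threads;
  // n = 1000 < 2 * 512, so threads = nextPow2((1000 + 1) / 2)
  //   = nextPow2(500) = 512, and blocks = ceil(1000 / 512) = 2,
  // giving 1024 threads in total to cover the 1000 elements.
  getNumBlocksAndThreads(1000, 512, blocks, threads);
  assert(threads == 512 && blocks == 2);
}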
90
src/utils/memory.cuh
Normal file
@@ -0,0 +1,90 @@
#ifndef CNCRT_SHMEM_H
#define CNCRT_SHMEM_H

#include "helper_cuda.h"
#include <atomic>
#include <iostream>
#include <mutex>
#include <thread>
#include <tuple>
#include <vector>

// bump allocator over a block of dynamic shared memory
class SharedMemory {
public:
  char *m_memory_block;
  int m_last_byte;

  __device__ SharedMemory(char *ptr) : m_memory_block(ptr), m_last_byte(0) {}

  template <typename T> __device__ void get_allocation(T **ptr, int elements) {
    *ptr = (T *)(&this->m_memory_block[m_last_byte]);
    this->m_last_byte += elements * sizeof(T);
  }
};

// tracks device allocations and frees them all on destruction
class DeviceMemory {
public:
  std::vector<std::tuple<void *, int>> m_allocated;
  std::mutex m_allocation_mtx;
  std::atomic<uint32_t> m_total_devices;

  DeviceMemory() : m_total_devices(1) {}

  __host__ void set_device(int device) {
    if (device > m_total_devices)
      m_total_devices = device + 1;
  }

  template <typename T>
  __host__ void get_allocation(T **ptr, int elements, int device) {
    T *res;
    cudaMalloc((void **)&res, sizeof(T) * elements);
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int elements,
                                              int device) {
    T *res;
    cudaMalloc((void **)&res, sizeof(T) * elements);
    cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice);
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int allocation,
                                              int elements, int device) {
    T *res;
    cudaMalloc((void **)&res, sizeof(T) * allocation);
    cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice);
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  void free_all_from_device(int device) {
    cudaSetDevice(device);
    for (auto elem : m_allocated) {
      auto dev = std::get<1>(elem);
      if (dev == device) {
        auto mem = std::get<0>(elem);
        checkCudaErrors(cudaFree(mem));
      }
    }
  }

  __host__ ~DeviceMemory() {
    for (auto elem : m_allocated) {
      auto dev = std::get<1>(elem);
      auto mem = std::get<0>(elem);
      cudaSetDevice(dev);
      checkCudaErrors(cudaFree(mem));
    }
  }
};

#endif // CNCRT_SHMEM_H
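Note: a hypothetical kernel (not from this commit) showing the intended SharedMemory pattern: carve typed slices out of a single dynamic shared memory block.

template <typename Torus, class params>
__global__ void example_kernel() {
  extern __shared__ char sharedmem[];
  SharedMemory shmem(sharedmem);

  Torus *acc;       // params::degree accumulator coefficients
  double2 *fft_buf; // half-size complex buffer for the FFT
  shmem.get_allocation(&acc, params::degree);
  shmem.get_allocation(&fft_buf, params::degree / 2 + 1);
  // the launch must pass at least the sum of these sizes, in bytes,
  // as the dynamic shared memory argument of <<<...>>>
}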
29
src/utils/timer.cuh
Normal file
@@ -0,0 +1,29 @@
#ifndef CNCRT_TIMER_H
#define CNCRT_TIMER_H

#define synchronize_threads_in_block() __syncthreads()

// RAII helper: when `active` is true, measures the time between its
// construction and destruction with CUDA events and prints the result
template <bool active> class CudaMeasureExecution {
public:
  cudaEvent_t m_start, m_stop;

  __host__ CudaMeasureExecution() {
    if constexpr (active) {
      cudaEventCreate(&m_start);
      cudaEventCreate(&m_stop);
      cudaEventRecord(m_start);
    }
  }

  __host__ ~CudaMeasureExecution() {
    if constexpr (active) {
      float ms;
      cudaEventRecord(m_stop);
      cudaEventSynchronize(m_stop);
      cudaEventElapsedTime(&ms, m_start, m_stop);
      std::cout << "Execution took " << ms << "ms" << std::endl;
    }
  }
};

#endif // CNCRT_TIMER_H
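Note: hypothetical usage (kernel, grid and threads names are placeholders): scope the timer around a launch inside some host function, e.g.

{
  CudaMeasureExecution<true> timer; // constructor records the start event
  my_kernel<<<grid, threads>>>(/* ... */);
} // destructor records the stop event, synchronizes, and prints the time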