mirror of
https://github.com/zama-ai/concrete.git
synced 2026-04-17 03:00:54 -04:00
feat(cuda): Implements the CMUX Tree on CUDA backend.
This commit is contained in:
@@ -78,17 +78,44 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
|
||||
uint32_t num_test_vectors,
|
||||
uint32_t lwe_idx,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
void cuda_cmux_tree_32(
|
||||
void *v_stream,
|
||||
void *glwe_out,
|
||||
void *ggsw_in,
|
||||
void *lut_vector,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t base_log,
|
||||
uint32_t l_gadget,
|
||||
uint32_t r,
|
||||
uint32_t max_shared_memory);
|
||||
|
||||
void cuda_cmux_tree_64(
|
||||
void *v_stream,
|
||||
void *glwe_out,
|
||||
void *ggsw_in,
|
||||
void *lut_vector,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t base_log,
|
||||
uint32_t l_gadget,
|
||||
uint32_t r,
|
||||
uint32_t max_shared_memory);
|
||||
};
|
||||
|
||||
|
||||
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
|
||||
int glwe_dimension,
|
||||
uint32_t l_gadget);
|
||||
|
||||
__device__ double2*
|
||||
get_ith_mask_kth_block(double2* ptr, int i, int k, int level, uint32_t polynomial_size,
|
||||
template <typename T>
|
||||
__device__ T*
|
||||
get_ith_mask_kth_block(T* ptr, int i, int k, int level, uint32_t polynomial_size,
|
||||
int glwe_dimension, uint32_t l_gadget);
|
||||
|
||||
__device__ double2*
|
||||
get_ith_body_kth_block(double2 *ptr, int i, int k, int level, uint32_t polynomial_size,
|
||||
template <typename T>
|
||||
__device__ T*
|
||||
get_ith_body_kth_block(T *ptr, int i, int k, int level, uint32_t polynomial_size,
|
||||
int glwe_dimension, uint32_t l_gadget);
|
||||
#endif // CUDA_BOOTSTRAP_H
|
||||
|
||||
103
src/bootstrap_wop.cu
Normal file
103
src/bootstrap_wop.cu
Normal file
@@ -0,0 +1,103 @@
|
||||
#include "bootstrap_wop.cuh"
|
||||
|
||||
void cuda_cmux_tree_32(
|
||||
void *v_stream,
|
||||
void *glwe_out,
|
||||
void *ggsw_in,
|
||||
void *lut_vector,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t base_log,
|
||||
uint32_t l_gadget,
|
||||
uint32_t r,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
host_cmux_tree<uint32_t, int32_t, Degree<512>>(
|
||||
v_stream,
|
||||
(uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector,
|
||||
glwe_dimension, polynomial_size, base_log, l_gadget, r,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_cmux_tree<uint32_t, int32_t, Degree<1024>>(
|
||||
v_stream,
|
||||
(uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector,
|
||||
glwe_dimension, polynomial_size, base_log, l_gadget, r,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_cmux_tree<uint32_t, int32_t, Degree<2048>>(
|
||||
v_stream,
|
||||
(uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector,
|
||||
glwe_dimension, polynomial_size, base_log, l_gadget, r,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_cmux_tree<uint32_t, int32_t, Degree<4096>>(
|
||||
v_stream,
|
||||
(uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector,
|
||||
glwe_dimension, polynomial_size, base_log, l_gadget, r,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_cmux_tree<uint32_t, int32_t, Degree<8192>>(
|
||||
v_stream,
|
||||
(uint32_t *) glwe_out, (uint32_t *) ggsw_in, (uint32_t *) lut_vector,
|
||||
glwe_dimension, polynomial_size, base_log, l_gadget, r,
|
||||
max_shared_memory);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void cuda_cmux_tree_64(
|
||||
void *v_stream,
|
||||
void *glwe_out,
|
||||
void *ggsw_in,
|
||||
void *lut_vector,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t base_log,
|
||||
uint32_t l_gadget,
|
||||
uint32_t r,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
switch (polynomial_size) {
|
||||
case 512:
|
||||
host_cmux_tree<uint64_t, int64_t, Degree<512>>(
|
||||
v_stream,
|
||||
(uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector,
|
||||
glwe_dimension, polynomial_size, base_log, l_gadget, r,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 1024:
|
||||
host_cmux_tree<uint64_t, int64_t, Degree<1024>>(
|
||||
v_stream,
|
||||
(uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector,
|
||||
glwe_dimension, polynomial_size, base_log, l_gadget, r,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 2048:
|
||||
host_cmux_tree<uint64_t, int64_t, Degree<2048>>(
|
||||
v_stream,
|
||||
(uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector,
|
||||
glwe_dimension, polynomial_size, base_log, l_gadget, r,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 4096:
|
||||
host_cmux_tree<uint64_t, int64_t, Degree<4096>>(
|
||||
v_stream,
|
||||
(uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector,
|
||||
glwe_dimension, polynomial_size, base_log, l_gadget, r,
|
||||
max_shared_memory);
|
||||
break;
|
||||
case 8192:
|
||||
host_cmux_tree<uint64_t, int64_t, Degree<8192>>(
|
||||
v_stream,
|
||||
(uint64_t *) glwe_out, (uint64_t *) ggsw_in,(uint64_t *) lut_vector,
|
||||
glwe_dimension, polynomial_size, base_log, l_gadget, r,
|
||||
max_shared_memory);
|
||||
break;
|
||||
}
|
||||
}
|
||||
402
src/bootstrap_wop.cuh
Normal file
402
src/bootstrap_wop.cuh
Normal file
@@ -0,0 +1,402 @@
|
||||
#ifndef WOP_PBS_H
|
||||
#define WOP_PBS_H
|
||||
|
||||
#include "cooperative_groups.h"
|
||||
|
||||
#include "../include/helper_cuda.h"
|
||||
#include "bootstrap.h"
|
||||
#include "complex/operations.cuh"
|
||||
#include "crypto/gadget.cuh"
|
||||
#include "crypto/torus.cuh"
|
||||
#include "crypto/ggsw.cuh"
|
||||
#include "fft/bnsmfft.cuh"
|
||||
#include "fft/smfft.cuh"
|
||||
#include "fft/twiddles.cuh"
|
||||
#include "polynomial/functions.cuh"
|
||||
#include "polynomial/parameters.cuh"
|
||||
#include "polynomial/polynomial.cuh"
|
||||
#include "polynomial/polynomial_math.cuh"
|
||||
#include "utils/memory.cuh"
|
||||
#include "utils/timer.cuh"
|
||||
|
||||
template <typename T, class params>
|
||||
__device__ void fft(double2 *output, T *input){
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Reduce the size of the FFT to be performed by storing
|
||||
// the real-valued polynomial into a complex polynomial
|
||||
real_to_complex_compressed<T, params>(input, output);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(output);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
correction_direct_fft_inplace<params>(output);
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
template <typename T, typename ST, class params>
|
||||
__device__ void fft(double2 *output, T *input){
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Reduce the size of the FFT to be performed by storing
|
||||
// the real-valued polynomial into a complex polynomial
|
||||
real_to_complex_compressed<T, ST, params>(input, output);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(output);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
correction_direct_fft_inplace<params>(output);
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
template <class params>
|
||||
__device__ void ifft_inplace(double2 *data){
|
||||
synchronize_threads_in_block();
|
||||
|
||||
correction_inverse_fft_inplace<params>(data);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
NSMFFT_inverse<HalfDegree<params>>(data);
|
||||
synchronize_threads_in_block();
|
||||
}
|
||||
|
||||
/*
|
||||
* Receives an array of GLWE ciphertexts and two indexes to ciphertexts in this array,
|
||||
* and an array of GGSW ciphertexts with a index to one ciphertext in it. Compute a CMUX with these
|
||||
* operands and writes the output to a particular index of glwe_out.
|
||||
*
|
||||
* This function needs polynomial_size threads per block.
|
||||
*
|
||||
* - glwe_out: An array where the result should be written to.
|
||||
* - glwe_in: An array where the GLWE inputs are stored.
|
||||
* - ggsw_in: An array where the GGSW input is stored. In the fourier domain.
|
||||
* - selected_memory: An array to be used for the accumulators. Can be in the shared memory or
|
||||
* global memory.
|
||||
* - output_idx: The index of the output where the glwe ciphertext should be written.
|
||||
* - input_idx1: The index of the first glwe ciphertext we will use.
|
||||
* - input_idx2: The index of the second glwe ciphertext we will use.
|
||||
* - glwe_dim: This is k.
|
||||
* - polynomial_size: size of the polynomials. This is N.
|
||||
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
|
||||
* - l_gadget: number of decomposition levels in the gadget matrix (~4)
|
||||
* - ggsw_idx: The index of the GGSW we will use.
|
||||
*/
|
||||
template <typename Torus, typename STorus, class params>
|
||||
__device__ void cmux(
|
||||
Torus *glwe_out, Torus* glwe_in, double2 *ggsw_in, char *selected_memory,
|
||||
uint32_t output_idx, uint32_t input_idx1, uint32_t input_idx2,
|
||||
uint32_t glwe_dim, uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
|
||||
uint32_t ggsw_idx){
|
||||
|
||||
// Define glwe_sub
|
||||
Torus *glwe_sub_mask = (Torus *) selected_memory;
|
||||
Torus *glwe_sub_body = (Torus *) glwe_sub_mask + (ptrdiff_t)polynomial_size;
|
||||
|
||||
int16_t *glwe_mask_decomposed = (int16_t *)(glwe_sub_body + polynomial_size);
|
||||
int16_t *glwe_body_decomposed =
|
||||
(int16_t *)glwe_mask_decomposed + (ptrdiff_t)polynomial_size;
|
||||
|
||||
double2 *mask_res_fft = (double2 *)(glwe_body_decomposed +
|
||||
polynomial_size);
|
||||
double2 *body_res_fft =
|
||||
(double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2;
|
||||
|
||||
double2 *glwe_fft =
|
||||
(double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);
|
||||
|
||||
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
|
||||
|
||||
/////////////////////////////////////
|
||||
|
||||
// glwe2-glwe1
|
||||
|
||||
// Copy m0 to shared memory to preserve data
|
||||
auto m0_mask = &glwe_in[input_idx1 * (glwe_dim + 1) * polynomial_size];
|
||||
auto m0_body = m0_mask + polynomial_size;
|
||||
|
||||
// Just gets the pointer for m1 on global memory
|
||||
auto m1_mask = &glwe_in[input_idx2 * (glwe_dim + 1) * polynomial_size];
|
||||
auto m1_body = m1_mask + polynomial_size;
|
||||
|
||||
// Mask
|
||||
sub_polynomial<Torus, params>(
|
||||
glwe_sub_mask, m1_mask, m0_mask
|
||||
);
|
||||
// Body
|
||||
sub_polynomial<Torus, params>(
|
||||
glwe_sub_body, m1_body, m0_body
|
||||
);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Initialize the polynomial multiplication via FFT arrays
|
||||
// The polynomial multiplications happens at the block level
|
||||
// and each thread handles two or more coefficients
|
||||
int pos = threadIdx.x;
|
||||
for (int j = 0; j < params::opt / 2; j++) {
|
||||
mask_res_fft[pos].x = 0;
|
||||
mask_res_fft[pos].y = 0;
|
||||
body_res_fft[pos].x = 0;
|
||||
body_res_fft[pos].y = 0;
|
||||
pos += params::degree / params::opt;
|
||||
}
|
||||
|
||||
// Subtract each glwe operand, decompose the resulting
|
||||
// polynomial coefficients to multiply each decomposed level
|
||||
// with the corresponding part of the LUT
|
||||
for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) {
|
||||
|
||||
// Decomposition
|
||||
gadget.decompose_one_level(glwe_mask_decomposed,
|
||||
glwe_sub_mask,
|
||||
decomp_level);
|
||||
gadget.decompose_one_level(glwe_body_decomposed,
|
||||
glwe_sub_body,
|
||||
decomp_level);
|
||||
|
||||
// First, perform the polynomial multiplication for the mask
|
||||
synchronize_threads_in_block();
|
||||
fft<int16_t, params>(glwe_fft, glwe_mask_decomposed);
|
||||
|
||||
// External product and accumulate
|
||||
// Get the piece necessary for the multiplication
|
||||
auto mask_fourier = get_ith_mask_kth_block(
|
||||
ggsw_in, ggsw_idx, 0, decomp_level,
|
||||
polynomial_size, glwe_dim, l_gadget);
|
||||
auto body_fourier = get_ith_body_kth_block(
|
||||
ggsw_in, ggsw_idx, 0, decomp_level,
|
||||
polynomial_size, glwe_dim, l_gadget);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Perform the coefficient-wise product
|
||||
synchronize_threads_in_block();
|
||||
polynomial_product_accumulate_in_fourier_domain<params, double2>(
|
||||
mask_res_fft, glwe_fft, mask_fourier);
|
||||
polynomial_product_accumulate_in_fourier_domain<params, double2>(
|
||||
body_res_fft, glwe_fft, body_fourier);
|
||||
|
||||
// Now handle the polynomial multiplication for the body
|
||||
// in the same way
|
||||
synchronize_threads_in_block();
|
||||
fft<int16_t, params>(glwe_fft, glwe_body_decomposed);
|
||||
|
||||
// External product and accumulate
|
||||
// Get the piece necessary for the multiplication
|
||||
mask_fourier = get_ith_mask_kth_block(
|
||||
ggsw_in, ggsw_idx, 1, decomp_level,
|
||||
polynomial_size, glwe_dim, l_gadget);
|
||||
body_fourier = get_ith_body_kth_block(
|
||||
ggsw_in, ggsw_idx, 1, decomp_level,
|
||||
polynomial_size, glwe_dim, l_gadget);
|
||||
|
||||
synchronize_threads_in_block();
|
||||
|
||||
polynomial_product_accumulate_in_fourier_domain<params, double2>(
|
||||
mask_res_fft, glwe_fft, mask_fourier);
|
||||
polynomial_product_accumulate_in_fourier_domain<params, double2>(
|
||||
body_res_fft, glwe_fft, body_fourier);
|
||||
|
||||
}
|
||||
|
||||
// IFFT
|
||||
synchronize_threads_in_block();
|
||||
ifft_inplace<params>(mask_res_fft);
|
||||
ifft_inplace<params>(body_res_fft);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Write the output
|
||||
Torus *mb_mask = &glwe_out[output_idx * (glwe_dim + 1) * polynomial_size];
|
||||
Torus *mb_body = mb_mask + polynomial_size;
|
||||
|
||||
int tid = threadIdx.x;
|
||||
for(int i = 0; i < params::opt; i++){
|
||||
mb_mask[tid] = m0_mask[tid];
|
||||
mb_body[tid] = m0_body[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
|
||||
add_to_torus<Torus, params>(mask_res_fft, mb_mask);
|
||||
add_to_torus<Torus, params>(body_res_fft, mb_body);
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes several CMUXes using an array of GLWE ciphertexts and a single GGSW ciphertext.
|
||||
* The GLWE ciphertexts are picked two-by-two in sequence. Each thread block computes a single CMUX.
|
||||
*
|
||||
* - glwe_out: An array where the result should be written to.
|
||||
* - glwe_in: An array where the GLWE inputs are stored.
|
||||
* - ggsw_in: An array where the GGSW input is stored. In the fourier domain.
|
||||
* - device_mem: An pointer for the global memory in case the shared memory is not big enough to
|
||||
* store the accumulators.
|
||||
* - device_memory_size_per_block: Memory size needed to store all accumulators for a single block.
|
||||
* - glwe_dim: This is k.
|
||||
* - polynomial_size: size of the polynomials. This is N.
|
||||
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
|
||||
* - l_gadget: number of decomposition levels in the gadget matrix (~4)
|
||||
* - ggsw_idx: The index of the GGSW we will use.
|
||||
*/
|
||||
template <typename Torus, typename STorus, class params, sharedMemDegree SMD>
|
||||
__global__ void device_batch_cmux(
|
||||
Torus *glwe_out, Torus* glwe_in, double2 *ggsw_in,
|
||||
char *device_mem, size_t device_memory_size_per_block,
|
||||
uint32_t glwe_dim, uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
|
||||
uint32_t ggsw_idx){
|
||||
|
||||
int cmux_idx = blockIdx.x;
|
||||
int output_idx = cmux_idx;
|
||||
int input_idx1 = (cmux_idx << 1);
|
||||
int input_idx2 = (cmux_idx << 1) + 1;
|
||||
|
||||
// We use shared memory for intermediate result
|
||||
extern __shared__ char sharedmem[];
|
||||
char *selected_memory;
|
||||
|
||||
if constexpr (SMD == FULLSM)
|
||||
selected_memory = sharedmem;
|
||||
else
|
||||
selected_memory = &device_mem[blockIdx.x * device_memory_size_per_block];
|
||||
|
||||
cmux<Torus, STorus, params>(
|
||||
glwe_out, glwe_in, ggsw_in,
|
||||
selected_memory,
|
||||
output_idx, input_idx1, input_idx2,
|
||||
glwe_dim, polynomial_size,
|
||||
base_log, l_gadget,
|
||||
ggsw_idx);
|
||||
|
||||
}
|
||||
/*
|
||||
* This kernel executes the CMUX tree used by the hybrid packing of the WoPBS.
|
||||
*
|
||||
* Uses shared memory for intermediate results
|
||||
*
|
||||
* - v_stream: The CUDA stream that should be used.
|
||||
* - glwe_out: A device array for the output GLWE ciphertext.
|
||||
* - ggsw_in: A device array for the GGSW ciphertexts used in each layer.
|
||||
* - lut_vector: A device array for the GLWE ciphertexts used in the first layer.
|
||||
* - polynomial_size: size of the polynomials. This is N.
|
||||
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
|
||||
* - l_gadget: number of decomposition levels in the gadget matrix (~4)
|
||||
* - r: Number of layers in the tree.
|
||||
*/
|
||||
template <typename Torus, typename STorus, class params>
|
||||
void host_cmux_tree(
|
||||
void *v_stream,
|
||||
Torus *glwe_out,
|
||||
Torus *ggsw_in,
|
||||
Torus *lut_vector,
|
||||
uint32_t glwe_dimension,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t base_log,
|
||||
uint32_t l_gadget,
|
||||
uint32_t r,
|
||||
uint32_t max_shared_memory) {
|
||||
|
||||
assert(glwe_dimension == 1); // For larger k we will need to adjust the mask size
|
||||
assert(r >= 1);
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
int num_lut = (1<<r);
|
||||
|
||||
cuda_initialize_twiddles(polynomial_size, 0);
|
||||
|
||||
int memory_needed_per_block =
|
||||
sizeof(Torus) * polynomial_size + // glwe_sub_mask
|
||||
sizeof(Torus) * polynomial_size + // glwe_sub_body
|
||||
sizeof(int16_t) * polynomial_size + // glwe_mask_decomposed
|
||||
sizeof(int16_t) * polynomial_size + // glwe_body_decomposed
|
||||
sizeof(double2) * polynomial_size/2 + // mask_res_fft
|
||||
sizeof(double2) * polynomial_size/2 + // body_res_fft
|
||||
sizeof(double2) * polynomial_size/2; // glwe_fft
|
||||
|
||||
dim3 thds(polynomial_size / params::opt, 1, 1);
|
||||
|
||||
//////////////////////
|
||||
// std::cout << "Applying the FFT on m^tree" << std::endl;
|
||||
double2 *d_ggsw_fft_in;
|
||||
int ggsw_size = r * polynomial_size * (glwe_dimension + 1) * (glwe_dimension + 1) * l_gadget;
|
||||
checkCudaErrors(cudaMalloc((void **)&d_ggsw_fft_in, ggsw_size * sizeof(double)));
|
||||
|
||||
batch_fft_ggsw_vector<Torus, STorus, params>(
|
||||
d_ggsw_fft_in, ggsw_in, r, glwe_dimension, polynomial_size, l_gadget);
|
||||
|
||||
//////////////////////
|
||||
|
||||
// Allocate global memory in case parameters are too large
|
||||
char *d_mem;
|
||||
if (max_shared_memory < memory_needed_per_block) {
|
||||
checkCudaErrors(cudaMalloc((void **) &d_mem, memory_needed_per_block * (1 << (r - 1))));
|
||||
}else{
|
||||
checkCudaErrors(cudaFuncSetAttribute(
|
||||
device_batch_cmux<Torus, STorus, params, FULLSM>,
|
||||
cudaFuncAttributeMaxDynamicSharedMemorySize,
|
||||
memory_needed_per_block));
|
||||
// TODO (Agnes): is this necessary?
|
||||
checkCudaErrors(cudaFuncSetCacheConfig(
|
||||
device_batch_cmux<Torus, STorus, params, FULLSM>,
|
||||
cudaFuncCachePreferShared));
|
||||
}
|
||||
|
||||
// Allocate buffers
|
||||
int glwe_size = (glwe_dimension + 1) * polynomial_size;
|
||||
Torus *d_buffer1, *d_buffer2;
|
||||
checkCudaErrors(cudaMalloc((void **)&d_buffer1, num_lut * glwe_size * sizeof(Torus)));
|
||||
checkCudaErrors(cudaMalloc((void **)&d_buffer2, num_lut * glwe_size * sizeof(Torus)));
|
||||
checkCudaErrors(cudaMemcpyAsync(
|
||||
d_buffer1, lut_vector,
|
||||
num_lut * glwe_size * sizeof(Torus),
|
||||
cudaMemcpyDeviceToDevice, *stream));
|
||||
|
||||
Torus *output;
|
||||
// Run the cmux tree
|
||||
for(int layer_idx = 0; layer_idx < r; layer_idx++){
|
||||
output = (layer_idx % 2? d_buffer1 : d_buffer2);
|
||||
Torus *input = (layer_idx % 2? d_buffer2 : d_buffer1);
|
||||
|
||||
int num_cmuxes = (1<<(r-1-layer_idx));
|
||||
dim3 grid(num_cmuxes, 1, 1);
|
||||
|
||||
// walks horizontally through the leafs
|
||||
if(max_shared_memory < memory_needed_per_block)
|
||||
device_batch_cmux<Torus, STorus, params, NOSM>
|
||||
<<<grid, thds, memory_needed_per_block, *stream>>>(
|
||||
output, input, d_ggsw_fft_in,
|
||||
d_mem, memory_needed_per_block,
|
||||
glwe_dimension, // k
|
||||
polynomial_size, base_log, l_gadget,
|
||||
layer_idx // r
|
||||
);
|
||||
else
|
||||
device_batch_cmux<Torus, STorus, params, FULLSM>
|
||||
<<<grid, thds, memory_needed_per_block, *stream>>>(
|
||||
output, input, d_ggsw_fft_in,
|
||||
d_mem, memory_needed_per_block,
|
||||
glwe_dimension, // k
|
||||
polynomial_size, base_log, l_gadget,
|
||||
layer_idx // r
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
checkCudaErrors(cudaStreamSynchronize(*stream));
|
||||
checkCudaErrors(cudaMemcpy(
|
||||
glwe_out, output,
|
||||
(glwe_dimension+1) * polynomial_size * sizeof(Torus),
|
||||
cudaMemcpyDeviceToDevice));
|
||||
|
||||
checkCudaErrors(cudaDeviceSynchronize());
|
||||
|
||||
// Free memory
|
||||
checkCudaErrors(cudaFree(d_ggsw_fft_in));
|
||||
checkCudaErrors(cudaFree(d_buffer1));
|
||||
checkCudaErrors(cudaFree(d_buffer2));
|
||||
if(max_shared_memory < memory_needed_per_block)
|
||||
checkCudaErrors(cudaFree(d_mem));
|
||||
|
||||
}
|
||||
#endif //WO_PBS_H
|
||||
@@ -13,16 +13,18 @@ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
|
||||
return i * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) * l_gadget;
|
||||
}
|
||||
|
||||
__device__ double2*
|
||||
get_ith_mask_kth_block(double2* ptr, int i, int k, int level, uint32_t polynomial_size,
|
||||
template <typename T>
|
||||
__device__ T*
|
||||
get_ith_mask_kth_block(T* ptr, int i, int k, int level, uint32_t polynomial_size,
|
||||
int glwe_dimension, uint32_t l_gadget) {
|
||||
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension, l_gadget) +
|
||||
level * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) +
|
||||
k * polynomial_size / 2 * (glwe_dimension + 1)];
|
||||
}
|
||||
|
||||
__device__ double2*
|
||||
get_ith_body_kth_block(double2 *ptr, int i, int k, int level, uint32_t polynomial_size,
|
||||
template <typename T>
|
||||
__device__ T*
|
||||
get_ith_body_kth_block(T *ptr, int i, int k, int level, uint32_t polynomial_size,
|
||||
int glwe_dimension, uint32_t l_gadget) {
|
||||
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension, l_gadget) +
|
||||
level * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) +
|
||||
@@ -154,4 +156,25 @@ void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
|
||||
glwe_dim, l_gadget, polynomial_size);
|
||||
}
|
||||
|
||||
|
||||
// We need these lines so the compiler knows how to specialize these functions
|
||||
template __device__ uint64_t*
|
||||
get_ith_mask_kth_block(uint64_t* ptr, int i, int k, int level, uint32_t polynomial_size,
|
||||
int glwe_dimension, uint32_t l_gadget);
|
||||
template __device__ uint32_t*
|
||||
get_ith_mask_kth_block(uint32_t* ptr, int i, int k, int level, uint32_t polynomial_size,
|
||||
int glwe_dimension, uint32_t l_gadget);
|
||||
template __device__ double2*
|
||||
get_ith_mask_kth_block(double2* ptr, int i, int k, int level, uint32_t polynomial_size,
|
||||
int glwe_dimension, uint32_t l_gadget);
|
||||
template __device__ uint64_t*
|
||||
get_ith_body_kth_block(uint64_t *ptr, int i, int k, int level, uint32_t polynomial_size,
|
||||
int glwe_dimension, uint32_t l_gadget);
|
||||
template __device__ uint32_t*
|
||||
get_ith_body_kth_block(uint32_t *ptr, int i, int k, int level, uint32_t polynomial_size,
|
||||
int glwe_dimension, uint32_t l_gadget);
|
||||
template __device__ double2*
|
||||
get_ith_body_kth_block(double2 *ptr, int i, int k, int level, uint32_t polynomial_size,
|
||||
int glwe_dimension, uint32_t l_gadget);
|
||||
|
||||
#endif // CNCRT_BSK_H
|
||||
|
||||
61
src/crypto/ggsw.cuh
Normal file
61
src/crypto/ggsw.cuh
Normal file
@@ -0,0 +1,61 @@
|
||||
#ifndef CONCRETE_CORE_GGSW_CUH
|
||||
#define CONCRETE_CORE_GGSW_CUH
|
||||
|
||||
template <typename T, typename ST, class params>
|
||||
__global__ void batch_fft_ggsw_vectors(double2 *dest, T *src){
|
||||
|
||||
extern __shared__ char sharedmem[];
|
||||
|
||||
double2 *shared_output = (double2*) sharedmem;
|
||||
|
||||
// Compression
|
||||
int offset = blockIdx.x * blockDim.x;
|
||||
int tid = threadIdx.x;
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt >> 1; i++) {
|
||||
ST x = src[(2 * tid) + params::opt * offset];
|
||||
ST y = src[(2 * tid + 1) + params::opt * offset];
|
||||
shared_output[tid].x = x / (double)std::numeric_limits<T>::max();
|
||||
shared_output[tid].y = y / (double)std::numeric_limits<T>::max();
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Switch to the FFT space
|
||||
NSMFFT_direct<HalfDegree<params>>(shared_output);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
correction_direct_fft_inplace<params>(shared_output);
|
||||
synchronize_threads_in_block();
|
||||
|
||||
// Write the output to global memory
|
||||
tid = threadIdx.x;
|
||||
for (int j = 0; j < params::opt >> 1; j++) {
|
||||
dest[tid + (params::opt >> 1) * offset] = shared_output[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies the FFT transform on sequence of GGSW ciphertexts already in the global memory
|
||||
*/
|
||||
template <typename T, typename ST, class params>
|
||||
void batch_fft_ggsw_vector(double2 *dest, T *src,
|
||||
uint32_t r,
|
||||
uint32_t glwe_dim,
|
||||
uint32_t polynomial_size,
|
||||
uint32_t l_gadget) {
|
||||
|
||||
int shared_memory_size = sizeof(double) * polynomial_size;
|
||||
|
||||
int total_polynomials = r * (glwe_dim + 1) * (glwe_dim + 1) * l_gadget;
|
||||
int gridSize = total_polynomials;
|
||||
int blockSize = polynomial_size / params::opt;
|
||||
|
||||
batch_fft_ggsw_vectors<T, ST, params><<<gridSize, blockSize, shared_memory_size>>>(dest, src);
|
||||
checkCudaErrors(cudaGetLastError());
|
||||
|
||||
}
|
||||
|
||||
|
||||
#endif //CONCRETE_CORE_GGSW_CUH
|
||||
@@ -17,6 +17,22 @@ __device__ void real_to_complex_compressed(T *src, double2 *dst) {
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename ST, class params>
|
||||
__device__ void real_to_complex_compressed(T *src, double2 *dst) {
|
||||
int tid = threadIdx.x;
|
||||
|
||||
#pragma unroll
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
ST x = src[2 * tid];
|
||||
ST y = src[2 * tid + 1];
|
||||
|
||||
dst[tid].x = x / (double)std::numeric_limits<T>::max();
|
||||
dst[tid].y = y / (double)std::numeric_limits<T>::max();
|
||||
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* copy source polynomial to specific slice of batched polynomials
|
||||
* used only in low latency version
|
||||
@@ -158,18 +174,14 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
|
||||
double frac = v1 - floor(v1);
|
||||
frac *= mx;
|
||||
double carry = frac - floor(frac);
|
||||
double zero_point_five = 1. / 2.;
|
||||
if (carry >= zero_point_five)
|
||||
frac++;
|
||||
|
||||
frac += (carry >= 0.5);
|
||||
|
||||
Torus V1 = typecast_double_to_torus<Torus>(frac);
|
||||
|
||||
frac = v2 - floor(v2);
|
||||
frac *= mx;
|
||||
carry = frac - floor(v2);
|
||||
if (carry >= zero_point_five)
|
||||
frac++;
|
||||
frac += (carry >= 0.5);
|
||||
|
||||
Torus V2 = typecast_double_to_torus<Torus>(frac);
|
||||
|
||||
|
||||
@@ -5,6 +5,15 @@
|
||||
#include "parameters.cuh"
|
||||
#include "polynomial.cuh"
|
||||
|
||||
template <typename FT, class params>
|
||||
__device__ void sub_polynomial(FT *result, FT *first, FT *second) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt; i++) {
|
||||
result[tid] = first[tid] - second[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
template <class params, typename FT>
|
||||
__device__ void polynomial_product_in_fourier_domain(FT *result, FT *first,
|
||||
FT *second) {
|
||||
@@ -62,4 +71,14 @@ __device__ void polynomial_product_accumulate_in_fourier_domain(
|
||||
}
|
||||
}
|
||||
|
||||
template <class params, typename T>
|
||||
__device__ void polynomial_product_accumulate_in_fourier_domain(
|
||||
T *result, T *first, T *second) {
|
||||
int tid = threadIdx.x;
|
||||
for (int i = 0; i < params::opt / 2; i++) {
|
||||
result[tid] += first[tid] * second[tid];
|
||||
tid += params::degree / params::opt;
|
||||
}
|
||||
}
|
||||
|
||||
#endif // CNCRT_POLYNOMIAL_MATH_H
|
||||
|
||||
Reference in New Issue
Block a user