chore(cuda): Remove old comments from concrete-cuda.

Author: Pedro Alves
Date: 2022-10-06 17:39:24 +02:00
Committed by: Agnès Leroy
parent 01ea1cf2f2
commit 3d6524ccf3
11 changed files with 10 additions and 149 deletions

View File

@@ -12,7 +12,6 @@
#include "../include/helper_cuda.h"
#include "bootstrap.h"
#include "complex/operations.cuh"
//#include "crypto/bootstrapping_key.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "fft/bnsmfft.cuh"
@@ -83,7 +82,6 @@ __global__ void device_bootstrap_amortized(
// polynomials take coefficients between -B/2 and B/2, they can be represented
// with only 16 bits, assuming the base log does not exceed 16
int16_t *accumulator_mask_decomposed = (int16_t *)selected_memory;
// TODO (Agnes) why not the 16 bits representation here?
int16_t *accumulator_body_decomposed =
(int16_t *)accumulator_mask_decomposed + polynomial_size;
Torus *accumulator_mask = (Torus *)accumulator_body_decomposed +
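For context on the 16-bit packing: with B = 2^base_log every decomposed coefficient lies in [-B/2, B/2), so it fits an int16_t exactly when base_log <= 16. A minimal host-side sketch of extracting one carry-free level under that bound (decompose_level is a hypothetical helper, not the kernel's GadgetMatrix):

#include <cstdint>

// Level `level` of a radix-B decomposition of a 32-bit torus value,
// re-centered into [-B/2, B/2). Assumes base_log * (level + 1) <= 32.
int16_t decompose_level(uint32_t x, uint32_t base_log, uint32_t level) {
  uint32_t shift = 32 - base_log * (level + 1);
  uint32_t digit = (x >> shift) & ((1u << base_log) - 1);
  uint32_t half = 1u << (base_log - 1);
  return (int16_t)((int32_t)digit - (digit >= half ? (1 << base_log) : 0));
}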
@@ -103,28 +101,11 @@ __global__ void device_bootstrap_amortized(
accumulator_fft =
(double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);
/*
int dif0 = ((char*)accumulator_body_decomposed - (char*)selected_memory);
int dif1 = ((char*)accumulator_mask - (char*)accumulator_body_decomposed);
int dif2 = ((char*)accumulator_body - (char*)accumulator_mask);
int dif3 = ((char*)accumulator_mask_rotated - (char*)accumulator_body);
int dif4 = ((char*)accumulator_body_rotated - (char*)accumulator_mask_rotated);
int dif5 = ((char*)mask_res_fft - (char*)accumulator_body_rotated);
int dif6 = ((char*)body_res_fft - (char*)mask_res_fft);
int dif7 = (SMD != PARTIALSM) ? (char*)accumulator_fft - (char*)body_res_fft : 0;
if (threadIdx.x == 0 && blockIdx.x == 0) {
  printf("device and shared mem: %d %d %d %d %d %d %d %d\n", dif0, dif1, dif2,
         dif3, dif4, dif5, dif6, dif7);
}
*/
auto block_lwe_in = &lwe_in[blockIdx.x * (lwe_mask_size + 1)];
Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * 2];
// TODO (Agnes) try to store the gadget matrix in const memory to see if
// register use decreases Since all const mem is used for twiddles currently,
// it would mean moving some of them to global memory instead
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
// Put "b", the body, in [0, 2N[
@@ -145,7 +126,6 @@ __global__ void device_bootstrap_amortized(
// into l_gadget polynomials, and performing polynomial multiplication
// via an FFT with the RGSW encrypted secret key
for (int iteration = 0; iteration < lwe_mask_size; iteration++) {
// TODO make sure that following sync is necessary
synchronize_threads_in_block();
// Put "a" in [0, 2N[ instead of Zq
@@ -153,18 +133,6 @@ __global__ void device_bootstrap_amortized(
block_lwe_in[iteration],
2 * params::degree); // 2 * params::log2_degree + 1);
// TODO (Agnes) why is there this if condition?
if (a_hat == 0) {
// todo(Joao): **cannot use this optimization**
// the reason is that one of the input ciphertexts (blockIdx.z)
// might skip an iteration while others don't, which as a result
// will make that block not call the grid.sync(), causing a deadlock;
// a workaround might be to add a grid.sync() here, but it's not clear
// whether there are any edge cases;
// continue
}
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
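The deadlock described in the removed note is the classic cooperative-groups hazard: grid.sync() is a grid-wide barrier, so a block that continues past it strands every other block. A minimal illustration (hypothetical kernel, not part of this diff):

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void skip_hazard(const int *a_hat, int n) {
  cg::grid_group grid = cg::this_grid();
  for (int i = 0; i < n; i++) {
    if (a_hat[blockIdx.x * n + i] == 0)
      continue; // this block skips the barrier below; the others hang
    grid.sync();
  }
}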
@@ -200,7 +168,6 @@ __global__ void device_bootstrap_amortized(
// Now that the rotation is done, decompose the resulting polynomial
// coefficients so as to multiply each decomposed level with the
// corresponding part of the bootstrapping key
// TODO (Agnes) explain why we do that for the mask and body separately
for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) {
gadget.decompose_one_level(accumulator_mask_decomposed,
@@ -227,8 +194,6 @@ __global__ void device_bootstrap_amortized(
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
// TODO (Agnes) Explain why for the mask polynomial multiplication
// we need the bsk_body_slice and vice versa
auto bsk_mask_slice = PolynomialFourier<double2, params>(
get_ith_mask_kth_block(
bootstrapping_key, iteration, 0, decomp_level,
@@ -241,7 +206,7 @@ __global__ void device_bootstrap_amortized(
synchronize_threads_in_block();
// Perform the coefficient-wise product with the two pieces of
// bootstrapping key TODO (Agnes) why two pieces?
// bootstrapping key
polynomial_product_accumulate_in_fourier_domain(
mask_res_fft, accumulator_fft, bsk_mask_slice);
polynomial_product_accumulate_in_fourier_domain(
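The answer to the removed "why two pieces?" is that each bootstrapping-key level carries a mask half and a body half, and both are multiplied pointwise with the accumulator's FFT. A schematic of the multiply-accumulate that polynomial_product_accumulate_in_fourier_domain performs (sketch, not the library routine):

// result += a * b, complex coefficient-wise, over the N/2 packed values.
__device__ void mul_acc_fourier(double2 *result, const double2 *a,
                                const double2 *b, int half_degree) {
  for (int i = threadIdx.x; i < half_degree; i += blockDim.x) {
    result[i].x += a[i].x * b[i].x - a[i].y * b[i].y;
    result[i].y += a[i].x * b[i].y + a[i].y * b[i].x;
  }
}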
@@ -333,7 +298,7 @@ __global__ void device_bootstrap_amortized(
// The blind rotation for this block is over
// Now we can perform the sample extraction: for the body it's just
// the resulting constant coefficient of the accumulator
// For the mask it's more complicated TODO (Agnes) explain why
// For the mask it's more complicated
sample_extract_mask<Torus, params>(block_lwe_out, accumulator_mask);
sample_extract_body<Torus, params>(block_lwe_out, accumulator_body);
}
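Why the mask is "more complicated": the constant coefficient of a(X)s(X) mod X^N + 1 mixes the mask coefficients with reversed indices and flipped signs, whereas the body extraction really is just acc[0]. A sketch of the index arithmetic (illustrative, not the library's sample_extract_mask):

// LWE mask coefficient j multiplies s_j in the constant term of a(X)s(X):
// a_0 for j = 0, and -a_{N-j} for j > 0 (negacyclic wrap-around).
template <typename Torus>
__device__ void extract_mask_sketch(Torus *lwe_out, const Torus *acc, int N) {
  for (int j = threadIdx.x; j < N; j += blockDim.x)
    lwe_out[j] = (j == 0) ? acc[0] : (Torus)(-acc[N - j]);
}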
@@ -380,11 +345,6 @@ __host__ void host_bootstrap_amortized(
// handles opt polynomial coefficients
// (actually opt/2 coefficients, since we compress the real polynomial
// into a complex one)
// TODO (Agnes) Polynomial size / params::opt should be equal to 256 or 512
// probably, maybe 1024 would be too big?
// Or would it actually be good in our case to have the largest possible
// number of threads per block since anyway few blocks will run
// concurrently?
dim3 grid(input_lwe_ciphertext_count, 1, 1);
dim3 thds(polynomial_size / params::opt, 1, 1);
@@ -426,7 +386,6 @@ __host__ void host_bootstrap_amortized(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
SM_FULL));
// TODO (Agnes): is this necessary?
checkCudaErrors(cudaFuncSetCacheConfig(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncCachePreferShared));
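On the removed "is this necessary?": the cudaFuncSetAttribute call is required to let a kernel use more than the default 48 KB of dynamic shared memory, while cudaFuncCachePreferShared is only an advisory hint on devices with a configurable L1/shared split. The general pattern, with my_kernel and shared_bytes as placeholders:

// Raise the per-kernel cap, then request that size at launch time.
checkCudaErrors(cudaFuncSetAttribute(
    my_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_bytes));
my_kernel<<<grid, thds, shared_bytes, *stream>>>(/* args */);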
@@ -454,7 +413,6 @@ int cuda_get_pbs_per_gpu(int polynomial_size) {
int num_threads = polynomial_size / params::opt;
int num_devices;
cudaGetDeviceCount(&num_devices); // takes an out-pointer, not a device index
cudaDeviceProp device_properties;
// FIXME: here we assume every device has same properties
cudaGetDeviceProperties(&device_properties, 0);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&blocks_per_sm, device_bootstrap_amortized<Torus, params>,
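A typical way to turn the per-SM occupancy figure into a device-wide PBS budget, with kernel, num_threads and shared_bytes as placeholders (sketch):

int blocks_per_sm = 0, num_sm = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, kernel,
                                              num_threads, shared_bytes);
cudaDeviceGetAttribute(&num_sm, cudaDevAttrMultiProcessorCount, 0);
int max_concurrent_pbs = blocks_per_sm * num_sm; // one PBS per block here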

View File

@@ -48,12 +48,11 @@
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary TODO
* (Agnes) check this
* - between each stage a synchronization of the threads is necessary
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels TODO (Agnes) check this
* levels
* - the constant memory (64K) is used for storing the roots of identity
* values for the FFT
*/
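Schematically, the stages listed above repeat once per LWE mask coefficient (pseudocode; the helper names are placeholders for the routines in this file):

for (int i = 0; i < lwe_mask_size; i++) {
  rotate_accumulator(i);      // multiply by X^{a_i} - 1
  gadget_decompose();         // split into l_gadget levels
  synchronize_threads_in_block();
  forward_fft();              // switch to the Fourier domain
  multiply_accumulate_bsk(i); // pointwise product with the bootstrapping key
  inverse_fft();              // back to the coefficient representation
  synchronize_threads_in_block();
}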

View File

@@ -23,8 +23,7 @@
#include "utils/memory.cuh"
#include "utils/timer.cuh"
// Cooperative groups are used in the low latency
// version of the bootstrapping
// Cooperative groups are used in the low latency PBS
using namespace cooperative_groups;
namespace cg = cooperative_groups;
@@ -58,11 +57,6 @@ mul_trgsw_trlwe(Torus *accumulator,
// needed to perform the external product in this block (corresponding to
// the same decomposition level)
// auto bsk_mask_slice = bootstrapping_key.get_ith_mask_kth_block(
// gpu_num, iteration, blockIdx.y, blockIdx.x);
// auto bsk_body_slice = bootstrapping_key.get_ith_body_kth_block(
// gpu_num, iteration, blockIdx.y, blockIdx.x);
auto bsk_mask_slice = PolynomialFourier<double2, params>(
get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x,
@@ -195,7 +189,6 @@ __global__ void device_bootstrap_low_latency(
// Since the space in L1 cache is small, we use the same memory location for
// the rotated accumulator and the fft accumulator, since we know that the
// rotated array is not in use anymore by the time we perform the fft
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
// Put "b" in [0, 2N[
@@ -222,17 +215,6 @@ __global__ void device_bootstrap_low_latency(
block_lwe_in[i],
2 * params::degree); // 2 * params::log2_degree + 1);
if (a_hat == 0) {
// todo(Joao): **cannot use this optimization**
// the reason is that one of the input ciphertexts (blockIdx.z)
// might skip an iteration while others don't, which as a result
// will make that block not call the grid.sync(), causing a deadlock;
// a workaround might be to add a grid.sync() here, but it's not clear
// whether there are any edge cases;
// continue
}
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
@@ -245,8 +227,6 @@ __global__ void device_bootstrap_low_latency(
params::degree / params::opt>(
accumulator_rotated, base_log, l_gadget);
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)

View File

@@ -337,7 +337,6 @@ void host_cmux_tree(
device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
memory_needed_per_block));
// TODO (Agnes): is this necessary?
checkCudaErrors(cudaFuncSetCacheConfig(
device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncCachePreferShared));

View File

@@ -83,9 +83,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
int gridSize = total_polynomials;
int blockSize = polynomial_size / choose_opt(polynomial_size);
// todo(Joao): let's use cudaMallocHost here,
// since it allocates page-locked memory which allows
// faster data copy
double2 *h_bsk = (double2 *)malloc(buffer_size);
double2 *d_bsk;
cudaMalloc((void **)&d_bsk, buffer_size);
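For reference, the alternative the removed note proposes: page-locked (pinned) host memory makes host-to-device copies faster and lets them run asynchronously. A sketch (stream obtained from v_stream as below):

double2 *h_bsk;
checkCudaErrors(cudaMallocHost((void **)&h_bsk, buffer_size)); // pinned
// ... build the FFT-ready key in h_bsk ...
cudaMemcpyAsync(d_bsk, h_bsk, buffer_size, cudaMemcpyHostToDevice, *stream);
// ... once the copy has completed:
cudaFreeHost(h_bsk);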
@@ -110,7 +108,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
auto stream = static_cast<cudaStream_t *>(v_stream);
switch (polynomial_size) {
// FIXME (Agnes): check if polynomial sizes are ok
case 512:
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);

View File

@@ -36,8 +36,6 @@ __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
template <typename T>
__device__ __forceinline__ T rescale_torus_element(T element,
uint32_t log_shift) {
// todo(Joao): not sure if this works
// return element >> log_shift;
// Note: despite its name, callers pass the modulus 2N itself (not its log)
// as log_shift (see the `2 * params::degree` call sites), so the plain
// right shift in the removed note above would not be equivalent.
return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
             (double)log_shift);
}
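Concretely, for 32-bit torus elements the function computes round(element * 2N / 2^32): e.g. element = 2^31 (one half of the torus) with 2N = 2048 maps to 1024. A host-side mirror for checking (illustrative):

#include <cmath>
#include <cstdint>

uint32_t rescale32(uint32_t element, uint32_t two_N) {
  return (uint32_t)llround((double)element / 4294967296.0 * (double)two_N);
}
// rescale32(1u << 31, 2048) == 1024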

View File

@@ -80,7 +80,7 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
* Each thread is always in charge of "opt/2" pairs of coefficients,
* which is why we always loop through N/2 by N/opt strides
* The pragma unroll instruction tells the compiler to unroll the
* full loop, which should increase performance TODO (Agnes) check this
* full loop, which should increase performance
*/
bit_reverse_inplace<params>(A);
__syncthreads();
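The thread layout the comment describes, as a schematic: each of the N/opt threads owns opt/2 of the N/2 butterfly pairs, visited at a stride of N/opt:

int tid = threadIdx.x;
#pragma unroll
for (int k = 0; k < params::opt / 2; k++) {
  // butterfly on the pair owned by (tid, k) ...
  tid += params::degree / params::opt; // stride of N/opt
}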
@@ -113,8 +113,6 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
// between groups of 4 coefficients
// k=2, \zeta=exp(i pi/4) for even coefficients and
// exp(3 i pi / 4) for odd coefficients
// TODO (Agnes) how does this work on the gpu? aren't we doing
// a lot more computations than we should?
tid = threadIdx.x;
// odd = 0 for even coefficients, 1 for odd coefficients
int odd = tid & 1;
@@ -371,7 +369,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
* Each thread is always in charge of "opt/2" pairs of coefficients,
* which is why we always loop through N/2 by N/opt strides
* The pragma unroll instruction tells the compiler to unroll the
* full loop, which should increase performance TODO (Agnes) check this
* full loop, which should increase performance
*/
int tid;
int i1, i2;
@@ -589,8 +587,6 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
// between groups of 4 coefficients
// k=2, \zeta=exp(i pi/4) for even coefficients and
// exp(3 i pi / 4) for odd coefficients
// TODO (Agnes) how does this work on the gpu? aren't we doing
// a lot more computations than we should?
tid = threadIdx.x;
// odd = 0 for even coefficients, 1 for odd coefficients
int odd = tid & 1;
@@ -602,7 +598,6 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
i1 = (tid << 1) - odd;
i2 = i1 + 2;
// TODO(Beka) optimize twiddle multiplication
double2 w;
if (odd) {
w.x = -0.707106781186547461715008466854;
@@ -629,7 +624,6 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
// of coefficients, with a stride of 2
i1 = tid << 1;
i2 = i1 + 1;
// TODO(Beka) optimize twiddle multiplication
double2 w = {0, -1};
u = A[i1], v = A[i2];
A[i1] = (u + v) * 0.5;
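The hard-coded ±0.707106781186547... above is ±√2/2, the real and imaginary parts of exp(±iπ/4) and exp(±3iπ/4), and w = {0, -1} is exp(-iπ/2). A quick host-side check:

#include <cmath>
#include <cstdio>

int main() {
  printf("%.24f\n", cos(3.0 * M_PI / 4.0)); // -0.707106781186547...
  printf("%.24f\n", sin(-M_PI / 2.0));      // -1.0
}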

View File

@@ -2,14 +2,6 @@
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
#define GPU_BOOTSTRAP_TWIDDLES_CUH
// TODO (Agnes) depending on the device architecture
// can we make more of them __constant__?
// Do we have to define them all regardless of the
// polynomial degree and q values?
// TODO (Beka) make those two arrays dynamically sized,
// or find the exact maximum; for an 8192-length poly it should
// be less than 2048
extern __constant__ short SW1[2048];
extern __constant__ short SW2[2048];

View File

@@ -136,8 +136,6 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_ou
lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
}
// int lwe_size_before =
// (lwe_dimension_before + 1) * num_samples;
int lwe_size_after =
(lwe_dimension_after + 1) * num_samples;

View File

@@ -166,7 +166,6 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
Torus mx = (sizeof(Torus) == 4) ? UINT32_MAX : UINT64_MAX;
int tid = threadIdx.x;
#pragma unroll
// TODO (Beka) check if better memory access is possible
for (int i = 0; i < params::opt / 2; i++) {
double v1 = m_values[tid].x;
double v2 = m_values[tid].y;
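The loop then folds each double back onto the torus. One common normalize-and-round formulation, as a sketch (the exact scaling against mx used here is elided by the hunk):

// Keep the fractional part of a real torus value and scale it back to
// the 32-bit integer torus; per-coefficient step only.
__device__ uint32_t to_torus32(double v) {
  double frac = v - rint(v); // center into [-0.5, 0.5)
  return (uint32_t)(int64_t)llrint(frac * 4294967296.0); // scale by 2^32
}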
@@ -194,8 +193,6 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_out, Torus *accumulator) {
// Set first coefficient of the accumulator as the body of the LWE sample
// todo(Joao): not every thread needs to set it
// if (threadIdx.x == 0)
lwe_out[params::degree] = accumulator[0];
}

View File

@@ -50,8 +50,7 @@ public:
int chunk_size) {
int pos = chunk_num * chunk_size;
T *ptr = &m_data[pos];
// todo(Joao): unsafe, the user must pass a chunk whose size is a
// multiple of the polynomial degree
return VectorPolynomial<T, params>(ptr, chunk_size / params::degree);
}
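The guard the removed note asks for could be as small as (sketch):

// Reject chunk sizes that are not a whole number of polynomials.
assert(chunk_size % params::degree == 0);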
@@ -88,8 +87,6 @@ public:
synchronize_threads_in_block();
}
// todo(Joao): we need to make these APIs more clear, as it's confusing what's
// being copied where
__device__ void copy_into_ith_polynomial(PolynomialFourier<T, params> &source,
int i) {
int tid = threadIdx.x;
@@ -160,22 +157,6 @@ public:
}
}
/*
__device__ void add_polynomial_inplace(PolynomialFourier<T, params> &source,
int begin) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
this->m_values[tid] += source.m_values[tid + begin];
tid = tid + params::degree / params::opt;
}
if (threadIdx.x == 0) {
this->m_values[params::degree / 2] +=
    source.m_values[params::degree / 2 + begin];
}
}
*/
__device__ void swap_quarters_inplace() {
int tid = threadIdx.x;
int s1 = params::quarter;
@@ -202,20 +183,6 @@ begin];
}
}
__device__ void
forward_negacyclic_fft_inplace(PolynomialFourier<double2, params> &X) {
// TODO function should be removed
}
__device__ void inverse_negacyclic_fft_inplace() {
// TODO function should be removed
}
template <typename Torus>
__device__ void add_to_torus(Polynomial<Torus, params> &result) {
// TODO function should be removed
}
__device__ T &operator[](int i) { return m_values[i]; }
};
@@ -474,18 +441,6 @@ public:
}
}
/*
__device__ void add_polynomial_inplace(Polynomial<T, params> &source,
int begin) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
this->coefficients[tid] += source.coefficients[tid + begin];
tid = tid + params::degree / params::opt;
}
}
*/
__device__ void sub_polynomial_inplace(Polynomial<T, params> &rhs) {
int tid = threadIdx.x;
const int grid_dim = blockDim.x;
@@ -543,11 +498,6 @@ public:
synchronize_threads_in_block();
}
template <typename FT>
__device__ void
forward_negacyclic_fft_half(PolynomialFourier<FT, params> &result) {
// TODO function should be removed
}
};
template <typename T, class params> class Vector {
public:
@@ -624,7 +574,6 @@ public:
__device__ void set_last_element(T elem) { m_data[m_size - 1] = elem; }
// todo(Joao): let's do coalesced access here at some point
__device__ void operator-=(const Vector<T, params> &rhs) {
assert(m_size == rhs.m_size);
int tid = threadIdx.x;
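For reference, the coalesced pattern the removed note asks for: consecutive threads read consecutive elements, then stride by the block size (sketch):

__device__ void sub_inplace_coalesced(T *data, const T *rhs, int size) {
  for (int i = threadIdx.x; i < size; i += blockDim.x)
    data[i] -= rhs[i]; // thread i handles i, i + blockDim.x, ...
}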