diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh
index 522c84caf..8c4e9eec6 100644
--- a/src/bootstrap_amortized.cuh
+++ b/src/bootstrap_amortized.cuh
@@ -12,7 +12,6 @@
 #include "../include/helper_cuda.h"
 #include "bootstrap.h"
 #include "complex/operations.cuh"
-//#include "crypto/bootstrapping_key.cuh"
 #include "crypto/gadget.cuh"
 #include "crypto/torus.cuh"
 #include "fft/bnsmfft.cuh"
@@ -83,7 +82,6 @@ __global__ void device_bootstrap_amortized(
   // polynomials take coefficients between -B/2 and B/2 they can be represented
   // with only 16 bits, assuming the base log does not exceed 2^16
   int16_t *accumulator_mask_decomposed = (int16_t *)selected_memory;
-  // TODO (Agnes) why not the 16 bits representation here?
   int16_t *accumulator_body_decomposed =
       (int16_t *)accumulator_mask_decomposed + polynomial_size;
   Torus *accumulator_mask = (Torus *)accumulator_body_decomposed +
@@ -103,28 +101,11 @@ __global__ void device_bootstrap_amortized(
   accumulator_fft =
       (double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);
 
-  /*
-  int dif0 = ((char*)accumulator_body_decomposed - (char*)selected_memory);
-  int dif1 = ((char*)accumulator_mask - (char*)accumulator_body_decomposed);
-  int dif2 = ((char*)accumulator_body - (char*)accumulator_mask);
-  int dif3 = ((char*)accumulator_mask_rotated - (char*)accumulator_body);
-  int dif4 = ((char*)accumulator_body_rotated -
-  (char*)accumulator_mask_rotated); int dif5 = ((char*)mask_res_fft -
-  (char*)accumulator_body_rotated); int dif6 = ((char*)body_res_fft -
-  (char*)mask_res_fft); int dif7 = (SMD != PARTIALSM)? (char*)accumulator_fft -
-  (char*)body_res_fft:0; if (threadIdx.x == 0 && blockIdx.x == 0) {
-  printf("device and shared mem: %d %d %d %d %d %d %d %d\n ",dif0, dif1, dif2,
-  dif3, dif4, dif5, dif6, dif7);
-  }
-  */
-
   auto block_lwe_in = &lwe_in[blockIdx.x * (lwe_mask_size + 1)];
   Torus *block_lut_vector = &lut_vector[lut_vector_indexes[lwe_idx +
                                         blockIdx.x] * params::degree * 2];
 
-  // TODO (Agnes) try to store the gadget matrix in const memory to see if
-  // register use decreases Since all const mem is used for twiddles currently,
-  // it would mean moving some of them to global memory instead
+
   GadgetMatrix gadget(base_log, l_gadget);
 
   // Put "b", the body, in [0, 2N[
@@ -145,7 +126,6 @@ __global__ void device_bootstrap_amortized(
   // into l_gadget polynomials, and performing polynomial multiplication
   // via an FFT with the RGSW encrypted secret key
   for (int iteration = 0; iteration < lwe_mask_size; iteration++) {
-    // TODO make sure that following sync is necessary
     synchronize_threads_in_block();
 
     // Put "a" in [0, 2N[ instead of Zq
@@ -153,18 +133,6 @@ __global__ void device_bootstrap_amortized(
         block_lwe_in[iteration], 2 * params::degree);
     // 2 * params::log2_degree + 1);
 
-    // TODO (Agnes) why is there this if condition?
-    if (a_hat == 0) {
-      // todo(Joao): **cannot use this optimization**
-      // the reason is that one of the input ciphertexts (blockIdx.z)
-      // might skip an iteration while others don't, which as a result
-      // will make that block not call the grid.sync(), causing a deadlock;
-      // maybe it's a workaround to add grid.sync() here, but not sure if
-      // there are any edge cases?
-
-      // continue
-    }
-
     // Perform ACC * (X^â - 1)
     multiply_by_monomial_negacyclic_and_sub_polynomial<
         Torus, params::opt, params::degree / params::opt>(
@@ -200,7 +168,9 @@ __global__ void device_bootstrap_amortized(
     // Now that the rotation is done, decompose the resulting polynomial
     // coefficients so as to multiply each decomposed level with the
     // corresponding part of the bootstrapping key
-    // TODO (Agnes) explain why we do that for the mask and body separately
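+    // The accumulator mask and body are decomposed separately: each
+    // decomposed polynomial is multiplied below with its own row of the
+    // bootstrapping key (the third argument of get_ith_mask_kth_block)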
 
     for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) {
       gadget.decompose_one_level(accumulator_mask_decomposed,
@@ -227,8 +197,6 @@ __global__ void device_bootstrap_amortized(
 
       // Get the bootstrapping key piece necessary for the multiplication
      // It is already in the Fourier domain
-      // TODO (Agnes) Explain why for the mask polynomial multiplication
-      // we need the bsk_body_slice and vice versa
       auto bsk_mask_slice = PolynomialFourier(
           get_ith_mask_kth_block(
               bootstrapping_key, iteration, 0, decomp_level,
@@ -241,7 +209,10 @@ __global__ void device_bootstrap_amortized(
       synchronize_threads_in_block();
 
       // Perform the coefficient-wise product with the two pieces of
-      // bootstrapping key TODO (Agnes) why two pieces?
+      // bootstrapping key
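+      // (each piece is itself a GLWE ciphertext: the mask piece accumulates
+      // into mask_res_fft and the body piece into body_res_fft, which
+      // together form the external product)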
       polynomial_product_accumulate_in_fourier_domain(
           mask_res_fft, accumulator_fft, bsk_mask_slice);
       polynomial_product_accumulate_in_fourier_domain(
           body_res_fft, accumulator_fft, bsk_body_slice);
@@ -333,7 +304,10 @@ __global__ void device_bootstrap_amortized(
   // The blind rotation for this block is over
   // Now we can perform the sample extraction: for the body it's just
   // the resulting constant coefficient of the accumulator
-  // For the mask it's more complicated TODO (Agnes) explain why
+  // For the mask it's more complicated
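+  // (roughly: the mask coefficients of the output LWE are taken from the
+  // accumulator mask in reverse order, with a sign flip where the
+  // negacyclic wrap-around applies)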
   sample_extract_mask(block_lwe_out, accumulator_mask);
   sample_extract_body(block_lwe_out, accumulator_body);
 }
@@ -380,11 +354,6 @@ __host__ void host_bootstrap_amortized(
   // handles opt polynomial coefficients
   // (actually opt/2 coefficients since we compress the real polynomial into a
   // complex)
-  // TODO (Agnes) Polynomial size / params::opt should be equal to 256 or 512
-  // probably, maybe 1024 would be too big?
-  // Or would it actually be good in our case to have the largest possible
-  // number of threads per block since anyway few blocks will run
-  // concurrently?
   dim3 grid(input_lwe_ciphertext_count, 1, 1);
   dim3 thds(polynomial_size / params::opt, 1, 1);
 
@@ -426,7 +395,6 @@ __host__ void host_bootstrap_amortized(
   checkCudaErrors(cudaFuncSetAttribute(
       device_bootstrap_amortized,
       cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
-  // TODO (Agnes): is this necessary?
   checkCudaErrors(cudaFuncSetCacheConfig(
       device_bootstrap_amortized,
       cudaFuncCachePreferShared));
@@ -454,7 +422,6 @@ int cuda_get_pbs_per_gpu(int polynomial_size) {
   int num_threads = polynomial_size / params::opt;
   cudaGetDeviceCount(0);
   cudaDeviceProp device_properties;
-  // FIXME: here we assume every device has same properties
   cudaGetDeviceProperties(&device_properties, 0);
   cudaOccupancyMaxActiveBlocksPerMultiprocessor(
       &blocks_per_sm, device_bootstrap_amortized,
diff --git a/src/bootstrap_low_latency.cu b/src/bootstrap_low_latency.cu
index 7073def4c..b89b754c4 100644
--- a/src/bootstrap_low_latency.cu
+++ b/src/bootstrap_low_latency.cu
@@ -48,12 +48,11 @@
  *  - switch to the FFT domain
  *  - multiply with the bootstrapping key
  *  - come back to the coefficients representation
- *  - between each stage a synchronization of the threads is necessary TODO
- * (Agnes) check this
+ *  - between each stage a synchronization of the threads is necessary
  *  - in case the device has enough shared memory, temporary arrays used for
  *    the different stages (accumulators) are stored into the shared memory
  *  - the accumulators serve to combine the results for all decomposition
- *    levels TODO (Agnes) check this
+ *    levels
  *  - the constant memory (64K) is used for storing the roots of identity
  *    values for the FFT
  */
diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh
index 5ab608e89..8b845aed5 100644
--- a/src/bootstrap_low_latency.cuh
+++ b/src/bootstrap_low_latency.cuh
@@ -23,8 +23,7 @@
 #include "utils/memory.cuh"
 #include "utils/timer.cuh"
 
-// Cooperative groups are used in the low latency
-// version of the bootstrapping
+// Cooperative groups are used in the low latency PBS
 using namespace cooperative_groups;
 namespace cg = cooperative_groups;
 
@@ -58,11 +57,6 @@ mul_trgsw_trlwe(Torus *accumulator,
   // needed to perform the external product in this block (corresponding to
   // the same decomposition level)
 
-  //  auto bsk_mask_slice = bootstrapping_key.get_ith_mask_kth_block(
-  //      gpu_num, iteration, blockIdx.y, blockIdx.x);
-  //  auto bsk_body_slice = bootstrapping_key.get_ith_body_kth_block(
-  //      gpu_num, iteration, blockIdx.y, blockIdx.x);
-
   auto bsk_mask_slice = PolynomialFourier(
       get_ith_mask_kth_block(
           bootstrapping_key, iteration, blockIdx.y, blockIdx.x,
@@ -195,7 +189,6 @@ __global__ void device_bootstrap_low_latency(
   // Since the space in L1 cache is small, we use the same memory location for
   // the rotated accumulator and the fft accumulator, since we know that the
   // rotated array is not in use anymore by the time we perform the fft
-
   GadgetMatrix gadget(base_log, l_gadget);
 
   // Put "b" in [0, 2N[
@@ -222,17 +215,8 @@ __global__ void device_bootstrap_low_latency(
         block_lwe_in[i], 2 * params::degree);
     // 2 * params::log2_degree + 1);
 
-    if (a_hat == 0) {
-      // todo(Joao): **cannot use this optimization**
-      // the reason is that one of the input ciphertexts (blockIdx.z)
-      // might skip an iteration while others don't, which as a result
-      // will make that block not call the grid.sync(), causing a deadlock;
-      // maybe it's a workaround to add grid.sync() here, but not sure if
-      // there are any edge cases?
-
-      // continue
-    }
-
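+    // Note: an iteration with a_hat == 0 cannot be skipped: every block
+    // must still reach the grid-wide synchronization, or the kernel deadlocks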
     // Perform ACC * (X^â - 1)
     multiply_by_monomial_negacyclic_and_sub_polynomial<
         Torus, params::opt, params::degree / params::opt>(
@@ -245,8 +229,6 @@ __global__ void device_bootstrap_low_latency(
   round_to_closest_multiple_inplace<Torus, params::opt,
                                     params::degree / params::opt>(
       accumulator_rotated, base_log, l_gadget);
-
-    // Decompose the accumulator. Each block gets one level of the
     // decomposition, for the mask and the body (so block 0 will have the
     // accumulator decomposed at level 0, 1 at 1, etc.)
diff --git a/src/bootstrap_wop.cuh b/src/bootstrap_wop.cuh
index b0c9e44ee..11fbe31c2 100644
--- a/src/bootstrap_wop.cuh
+++ b/src/bootstrap_wop.cuh
@@ -337,7 +337,6 @@ void host_cmux_tree(
   checkCudaErrors(cudaFuncSetAttribute(
       device_batch_cmux,
       cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block));
-  // TODO (Agnes): is this necessary?
   checkCudaErrors(cudaFuncSetCacheConfig(
       device_batch_cmux,
       cudaFuncCachePreferShared));
diff --git a/src/crypto/bootstrapping_key.cuh b/src/crypto/bootstrapping_key.cuh
index 6741312de..528fa6796 100644
--- a/src/crypto/bootstrapping_key.cuh
+++ b/src/crypto/bootstrapping_key.cuh
@@ -83,9 +83,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
   int gridSize = total_polynomials;
   int blockSize = polynomial_size / choose_opt(polynomial_size);
 
-  // todo(Joao): let's use cudaMallocHost here,
-  // since it allocates page-staged memory which allows
-  // faster data copy
+
   double2 *h_bsk = (double2 *)malloc(buffer_size);
   double2 *d_bsk;
   cudaMalloc((void **)&d_bsk, buffer_size);
@@ -110,7 +108,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
   auto stream = static_cast(v_stream);
 
   switch (polynomial_size) {
-    // FIXME (Agnes): check if polynomial sizes are ok
   case 512:
     batch_NSMFFT, ForwardFFT>>
         <<>>(d_bsk, dest);
diff --git a/src/crypto/torus.cuh b/src/crypto/torus.cuh
index 9dca4cb73..039bb4b50 100644
--- a/src/crypto/torus.cuh
+++ b/src/crypto/torus.cuh
@@ -36,8 +36,8 @@ __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
 template
 __device__ __forceinline__ T rescale_torus_element(T element,
                                                    uint32_t log_shift) {
-  // todo(Joao): not sure if this works
-  // return element >> log_shift;
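+  // Rescales a torus element into Z_{2N}: note that, despite its name,
+  // `log_shift` receives the target modulus itself (callers pass
+  // 2 * params::degree)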
   return round((double)element /
                (double(std::numeric_limits::max()) + 1.0) *
                (double)log_shift);
 }
diff --git a/src/fft/bnsmfft.cuh b/src/fft/bnsmfft.cuh
index f3117bda8..974b59acb 100644
--- a/src/fft/bnsmfft.cuh
+++ b/src/fft/bnsmfft.cuh
@@ -80,7 +80,7 @@ template __device__ void NSMFFT_direct(double2 *A) {
  * Each thread is always in charge of "opt/2" pairs of coefficients,
  * which is why we always loop through N/2 by N/opt strides
  * The pragma unroll instruction tells the compiler to unroll the
- * full loop, which should increase performance TODO (Agnes) check this
+ * full loop, which should increase performance
  */
  bit_reverse_inplace(A);
  __syncthreads();
@@ -113,8 +113,6 @@ template __device__ void NSMFFT_direct(double2 *A) {
   // between groups of 4 coefficients
   // k=2, \zeta=exp(i pi/4) for even coefficients and
   // exp(3 i pi / 4) for odd coefficients
-  // TODO (Agnes) how does this work on the gpu? aren't we doing
-  // a lot more computations than we should?
   tid = threadIdx.x;
   // odd = 0 for even coefficients, 1 for odd coefficients
   int odd = tid & 1;
@@ -371,7 +369,7 @@ template __device__ void NSMFFT_inverse(double2 *A) {
  * Each thread is always in charge of "opt/2" pairs of coefficients,
  * which is why we always loop through N/2 by N/opt strides
  * The pragma unroll instruction tells the compiler to unroll the
- * full loop, which should increase performance TODO (Agnes) check this
+ * full loop, which should increase performance
  */
  int tid;
  int i1, i2;
@@ -589,8 +587,6 @@ template __device__ void NSMFFT_inverse(double2 *A) {
   // between groups of 4 coefficients
   // k=2, \zeta=exp(i pi/4) for even coefficients and
   // exp(3 i pi / 4) for odd coefficients
-  // TODO (Agnes) how does this work on the gpu? aren't we doing
-  // a lot more computations than we should?
   tid = threadIdx.x;
   // odd = 0 for even coefficients, 1 for odd coefficients
   int odd = tid & 1;
@@ -602,7 +598,6 @@ template __device__ void NSMFFT_inverse(double2 *A) {
     i1 = (tid << 1) - odd;
     i2 = i1 + 2;
 
-    // TODO(Beka) optimize twiddle multiplication
     double2 w;
     if (odd) {
       w.x = -0.707106781186547461715008466854;
@@ -629,7 +624,6 @@ template __device__ void NSMFFT_inverse(double2 *A) {
     // of coefficients, with a stride of 2
     i1 = tid << 1;
     i2 = i1 + 1;
-    // TODO(Beka) optimize twiddle multiplication
     double2 w = {0, -1};
     u = A[i1], v = A[i2];
     A[i1] = (u + v) * 0.5;
diff --git a/src/fft/twiddles.cuh b/src/fft/twiddles.cuh
index c776f974f..7213d3647 100644
--- a/src/fft/twiddles.cuh
+++ b/src/fft/twiddles.cuh
@@ -2,14 +2,6 @@
 #ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
 #define GPU_BOOTSTRAP_TWIDDLES_CUH
 
-// TODO (Agnes) depending on the device architecture
-// can we make more of them __constant__?
-// Do we have to define them all regardless of the
-// polynomial degree and q values?
-
-// TODO (Beka) make those two arrays with dynamic size
-// or find exact maximum for 8192 length poly it shuld
-// be less than 2048
 extern __constant__ short SW1[2048];
 extern __constant__ short SW2[2048];
 
diff --git a/src/keyswitch.cuh b/src/keyswitch.cuh
index 877fb26c2..d3d45d60b 100644
--- a/src/keyswitch.cuh
+++ b/src/keyswitch.cuh
@@ -136,8 +136,6 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_ou
     lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
   }
 
-  //  int lwe_size_before =
-  //      (lwe_dimension_before + 1) * num_samples;
   int lwe_size_after = (lwe_dimension_after + 1) * num_samples;
 
diff --git a/src/polynomial/functions.cuh b/src/polynomial/functions.cuh
index 53988c011..572e38f82 100644
--- a/src/polynomial/functions.cuh
+++ b/src/polynomial/functions.cuh
@@ -166,7 +166,6 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
   Torus mx = (sizeof(Torus) == 4) ? UINT32_MAX : UINT64_MAX;
   int tid = threadIdx.x;
 #pragma unroll
-  // TODO (Beka) check if better memory access is possible
   for (int i = 0; i < params::opt / 2; i++) {
     double v1 = m_values[tid].x;
     double v2 = m_values[tid].y;
@@ -194,8 +193,6 @@
 template
 __device__ void sample_extract_body(Torus *lwe_out, Torus *accumulator) {
   // Set first coefficient of the accumulator as the body of the LWE sample
-  // todo(Joao): not every thread needs to set it
-  // if (threadIdx.x == 0)
   lwe_out[params::degree] = accumulator[0];
 }
diff --git a/src/polynomial/polynomial.cuh b/src/polynomial/polynomial.cuh
index b1339c634..5606e093a 100644
--- a/src/polynomial/polynomial.cuh
+++ b/src/polynomial/polynomial.cuh
@@ -50,8 +50,7 @@ public:
                                  int chunk_size) {
     int pos = chunk_num * chunk_size;
     T *ptr = &m_data[pos];
-    // todo(Joao): unsafe, user must pass chunk that has size multiple of
-    // polynomial degree
+
     return VectorPolynomial(ptr, chunk_size / params::degree);
   }
 
@@ -88,8 +87,6 @@ public:
     synchronize_threads_in_block();
   }
 
-  // todo(Joao): we need to make these APIs more clear, as it's confusing what's
-  // being copied where
   __device__ void copy_into_ith_polynomial(PolynomialFourier &source,
                                            int i) {
     int tid = threadIdx.x;
@@ -160,22 +157,6 @@ public:
     }
   }
 
-  /*
-  __device__ void add_polynomial_inplace(PolynomialFourier &source,
-                                         int begin) {
-    int tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt / 2; i++) {
-      this->m_values[tid] += source.m_values[tid + begin];
-      tid = tid + params::degree / params::opt;
-    }
-    if (threadIdx.x == 0) {
-      this->m_values[params::degree / 2] += source.m_values[params::degree / 2 +
-begin];
-    }
-  }
-  */
-
   __device__ void swap_quarters_inplace() {
     int tid = threadIdx.x;
     int s1 = params::quarter;
@@ -202,20 +183,6 @@ public:
     }
   }
 
-  __device__ void
-  forward_negacyclic_fft_inplace(PolynomialFourier &X) {
-    // TODO function should be removed
-  }
-
-  __device__ void inverse_negacyclic_fft_inplace() {
-    // TODO function should be removed
-  }
-
-  template
-  __device__ void add_to_torus(Polynomial &result) {
-    // TODO function should be removed
-  }
-
   __device__ T &operator[](int i) { return m_values[i]; }
 };
 
@@ -474,18 +441,6 @@ public:
     }
   }
 
-  /*
-  __device__ void add_polynomial_inplace(Polynomial &source,
-                                         int begin) {
-    int tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt; i++) {
-      this->coefficients[tid] += source.coefficients[tid + begin];
-      tid = tid + params::degree / params::opt;
-    }
-  }
-  */
-
   __device__ void sub_polynomial_inplace(Polynomial &rhs) {
     int tid = threadIdx.x;
     const int grid_dim = blockDim.x;
@@ -543,11 +498,6 @@ public:
     synchronize_threads_in_block();
   }
 
-  template
-  __device__ void
-  forward_negacyclic_fft_half(PolynomialFourier &result) {
-    // TODO function should be removed
-  }
 };
 
 template class Vector {
@@ -624,7 +574,6 @@ public:
   __device__ void set_last_element(T elem) { m_data[m_size - 1] = elem; }
 
-  // todo(Joao): let's do coalesced access here at some point
   __device__ void operator-=(const Vector &rhs) {
     assert(m_size == rhs->m_size);
     int tid = threadIdx.x;