chore(cuda): Remove old comments from concrete-cuda.

Author: Pedro Alves
Date: 2022-10-06 17:39:24 +02:00
Committed by: Agnès Leroy
parent 01ea1cf2f2
commit 3d6524ccf3
11 changed files with 10 additions and 149 deletions

View File

@@ -12,7 +12,6 @@
#include "../include/helper_cuda.h"
#include "bootstrap.h"
#include "complex/operations.cuh"
//#include "crypto/bootstrapping_key.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "fft/bnsmfft.cuh"
@@ -83,7 +82,6 @@ __global__ void device_bootstrap_amortized(
// polynomials take coefficients between -B/2 and B/2, they can be represented
// with only 16 bits, assuming the base log does not exceed 16
int16_t *accumulator_mask_decomposed = (int16_t *)selected_memory;
// TODO (Agnes) why not the 16 bits representation here?
int16_t *accumulator_body_decomposed =
(int16_t *)accumulator_mask_decomposed + polynomial_size;
Torus *accumulator_mask = (Torus *)accumulator_body_decomposed +
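For context on the 16-bit packing: with B = 2^base_log every decomposed coefficient lies in [-B/2, B/2), so it fits an int16_t exactly when base_log <= 16. A minimal host-side sketch of extracting one carry-free level under that bound (decompose_level is a hypothetical helper, not the kernel's GadgetMatrix):

#include <cstdint>

// Level `level` of a radix-B decomposition of a 32-bit torus value,
// re-centered into [-B/2, B/2). Assumes base_log * (level + 1) <= 32.
int16_t decompose_level(uint32_t x, uint32_t base_log, uint32_t level) {
  uint32_t shift = 32 - base_log * (level + 1);
  uint32_t digit = (x >> shift) & ((1u << base_log) - 1);
  uint32_t half = 1u << (base_log - 1);
  return (int16_t)((int32_t)digit - (digit >= half ? (1 << base_log) : 0));
}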
@@ -103,28 +101,11 @@ __global__ void device_bootstrap_amortized(
accumulator_fft =
(double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);
/*
int dif0 = ((char*)accumulator_body_decomposed - (char*)selected_memory);
int dif1 = ((char*)accumulator_mask - (char*)accumulator_body_decomposed);
int dif2 = ((char*)accumulator_body - (char*)accumulator_mask);
int dif3 = ((char*)accumulator_mask_rotated - (char*)accumulator_body);
int dif4 = ((char*)accumulator_body_rotated - (char*)accumulator_mask_rotated);
int dif5 = ((char*)mask_res_fft - (char*)accumulator_body_rotated);
int dif6 = ((char*)body_res_fft - (char*)mask_res_fft);
int dif7 = (SMD != PARTIALSM) ? (char*)accumulator_fft - (char*)body_res_fft : 0;
if (threadIdx.x == 0 && blockIdx.x == 0) {
  printf("device and shared mem: %d %d %d %d %d %d %d %d\n", dif0, dif1, dif2,
         dif3, dif4, dif5, dif6, dif7);
}
*/
auto block_lwe_in = &lwe_in[blockIdx.x * (lwe_mask_size + 1)];
Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * 2];
// TODO (Agnes) try to store the gadget matrix in const memory to see if
// register use decreases Since all const mem is used for twiddles currently,
// it would mean moving some of them to global memory instead
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
// Put "b", the body, in [0, 2N[
@@ -145,7 +126,6 @@ __global__ void device_bootstrap_amortized(
// into l_gadget polynomials, and performing polynomial multiplication
// via an FFT with the RGSW encrypted secret key
for (int iteration = 0; iteration < lwe_mask_size; iteration++) {
// TODO make sure that following sync is necessary
synchronize_threads_in_block();
// Put "a" in [0, 2N[ instead of Zq
@@ -153,18 +133,6 @@ __global__ void device_bootstrap_amortized(
block_lwe_in[iteration],
2 * params::degree); // 2 * params::log2_degree + 1);
// TODO (Agnes) why is there this if condition?
if (a_hat == 0) {
// todo(Joao): **cannot use this optimization**
// the reason is that one of the input ciphertexts (blockIdx.z)
// might skip an iteration while others don't, which as a result
// will make that block not call the grid.sync(), causing a deadlock;
// a workaround might be to add a grid.sync() here, but it's not clear
// whether there are any edge cases;
// continue
}
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
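The deadlock described in the removed note is the classic cooperative-groups hazard: grid.sync() is a grid-wide barrier, so a block that continues past it strands every other block. A minimal illustration (hypothetical kernel, not part of this diff):

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void skip_hazard(const int *a_hat, int n) {
  cg::grid_group grid = cg::this_grid();
  for (int i = 0; i < n; i++) {
    if (a_hat[blockIdx.x * n + i] == 0)
      continue; // this block skips the barrier below; the others hang
    grid.sync();
  }
}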
@@ -200,7 +168,6 @@ __global__ void device_bootstrap_amortized(
// Now that the rotation is done, decompose the resulting polynomial
// coefficients so as to multiply each decomposed level with the
// corresponding part of the bootstrapping key
// TODO (Agnes) explain why we do that for the mask and body separately
for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) {
gadget.decompose_one_level(accumulator_mask_decomposed,
@@ -227,8 +194,6 @@ __global__ void device_bootstrap_amortized(
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
// TODO (Agnes) Explain why for the mask polynomial multiplication
// we need the bsk_body_slice and vice versa
auto bsk_mask_slice = PolynomialFourier<double2, params>(
get_ith_mask_kth_block(
bootstrapping_key, iteration, 0, decomp_level,
@@ -241,7 +206,7 @@ __global__ void device_bootstrap_amortized(
synchronize_threads_in_block();
// Perform the coefficient-wise product with the two pieces of
// bootstrapping key TODO (Agnes) why two pieces?
// bootstrapping key
polynomial_product_accumulate_in_fourier_domain(
mask_res_fft, accumulator_fft, bsk_mask_slice);
polynomial_product_accumulate_in_fourier_domain(
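The answer to the removed "why two pieces?" is that each bootstrapping-key level carries a mask half and a body half, and both are multiplied pointwise with the accumulator's FFT. A schematic of the multiply-accumulate that polynomial_product_accumulate_in_fourier_domain performs (sketch, not the library routine):

// result += a * b, complex coefficient-wise, over the N/2 packed values.
__device__ void mul_acc_fourier(double2 *result, const double2 *a,
                                const double2 *b, int half_degree) {
  for (int i = threadIdx.x; i < half_degree; i += blockDim.x) {
    result[i].x += a[i].x * b[i].x - a[i].y * b[i].y;
    result[i].y += a[i].x * b[i].y + a[i].y * b[i].x;
  }
}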
@@ -333,7 +298,7 @@ __global__ void device_bootstrap_amortized(
// The blind rotation for this block is over
// Now we can perform the sample extraction: for the body it's just
// the resulting constant coefficient of the accumulator
// For the mask it's more complicated TODO (Agnes) explain why
// For the mask it's more complicated
sample_extract_mask<Torus, params>(block_lwe_out, accumulator_mask);
sample_extract_body<Torus, params>(block_lwe_out, accumulator_body);
}
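Why the mask is "more complicated": the constant coefficient of a(X)s(X) mod X^N + 1 mixes the mask coefficients with reversed indices and flipped signs, whereas the body extraction really is just acc[0]. A sketch of the index arithmetic (illustrative, not the library's sample_extract_mask):

// LWE mask coefficient j multiplies s_j in the constant term of a(X)s(X):
// a_0 for j = 0, and -a_{N-j} for j > 0 (negacyclic wrap-around).
template <typename Torus>
__device__ void extract_mask_sketch(Torus *lwe_out, const Torus *acc, int N) {
  for (int j = threadIdx.x; j < N; j += blockDim.x)
    lwe_out[j] = (j == 0) ? acc[0] : (Torus)(-acc[N - j]);
}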
@@ -380,11 +345,6 @@ __host__ void host_bootstrap_amortized(
// handles opt polynomial coefficients
// (actually opt/2 coefficients, since we compress the real polynomial
// into a complex one)
// TODO (Agnes) Polynomial size / params::opt should be equal to 256 or 512
// probably, maybe 1024 would be too big?
// Or would it actually be good in our case to have the largest possible
// number of threads per block since anyway few blocks will run
// concurrently?
dim3 grid(input_lwe_ciphertext_count, 1, 1);
dim3 thds(polynomial_size / params::opt, 1, 1);
@@ -426,7 +386,6 @@ __host__ void host_bootstrap_amortized(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
SM_FULL));
// TODO (Agnes): is this necessary?
checkCudaErrors(cudaFuncSetCacheConfig(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncCachePreferShared));
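On the removed "is this necessary?": the cudaFuncSetAttribute call is required to let a kernel use more than the default 48 KB of dynamic shared memory, while cudaFuncCachePreferShared is only an advisory hint on devices with a configurable L1/shared split. The general pattern, with my_kernel and shared_bytes as placeholders:

// Raise the per-kernel cap, then request that size at launch time.
checkCudaErrors(cudaFuncSetAttribute(
    my_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_bytes));
my_kernel<<<grid, thds, shared_bytes, *stream>>>(/* args */);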
@@ -454,7 +413,6 @@ int cuda_get_pbs_per_gpu(int polynomial_size) {
int num_threads = polynomial_size / params::opt;
int num_devices;
cudaGetDeviceCount(&num_devices); // takes an out-pointer, not a device index
cudaDeviceProp device_properties;
// FIXME: here we assume every device has same properties
cudaGetDeviceProperties(&device_properties, 0);
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&blocks_per_sm, device_bootstrap_amortized<Torus, params>,
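A typical way to turn the per-SM occupancy figure into a device-wide PBS budget, with kernel, num_threads and shared_bytes as placeholders (sketch):

int blocks_per_sm = 0, num_sm = 0;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, kernel,
                                              num_threads, shared_bytes);
cudaDeviceGetAttribute(&num_sm, cudaDevAttrMultiProcessorCount, 0);
int max_concurrent_pbs = blocks_per_sm * num_sm; // one PBS per block here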

View File

@@ -48,12 +48,11 @@
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary TODO
* (Agnes) check this
* - between each stage a synchronization of the threads is necessary
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels TODO (Agnes) check this
* levels
* - the constant memory (64K) is used for storing the roots of identity
* values for the FFT
*/
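Schematically, the stages listed above repeat once per LWE mask coefficient (pseudocode; the helper names are placeholders for the routines in this file):

for (int i = 0; i < lwe_mask_size; i++) {
  rotate_accumulator(i);      // multiply by X^{a_i} - 1
  gadget_decompose();         // split into l_gadget levels
  synchronize_threads_in_block();
  forward_fft();              // switch to the Fourier domain
  multiply_accumulate_bsk(i); // pointwise product with the bootstrapping key
  inverse_fft();              // back to the coefficient representation
  synchronize_threads_in_block();
}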

View File

@@ -23,8 +23,7 @@
#include "utils/memory.cuh"
#include "utils/timer.cuh"
// Cooperative groups are used in the low latency
// version of the bootstrapping
// Cooperative groups are used in the low latency PBS
using namespace cooperative_groups;
namespace cg = cooperative_groups;
@@ -58,11 +57,6 @@ mul_trgsw_trlwe(Torus *accumulator,
// needed to perform the external product in this block (corresponding to
// the same decomposition level)
// auto bsk_mask_slice = bootstrapping_key.get_ith_mask_kth_block(
// gpu_num, iteration, blockIdx.y, blockIdx.x);
// auto bsk_body_slice = bootstrapping_key.get_ith_body_kth_block(
// gpu_num, iteration, blockIdx.y, blockIdx.x);
auto bsk_mask_slice = PolynomialFourier<double2, params>(
get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x,
@@ -195,7 +189,6 @@ __global__ void device_bootstrap_low_latency(
// Since the space in L1 cache is small, we use the same memory location for
// the rotated accumulator and the fft accumulator, since we know that the
// rotated array is not in use anymore by the time we perform the fft
GadgetMatrix<Torus, params> gadget(base_log, l_gadget);
// Put "b" in [0, 2N[
@@ -222,17 +215,6 @@ __global__ void device_bootstrap_low_latency(
block_lwe_in[i],
2 * params::degree); // 2 * params::log2_degree + 1);
if (a_hat == 0) {
// todo(Joao): **cannot use this optimization**
// the reason is that one of the input ciphertexts (blockIdx.z)
// might skip an iteration while others don't, which as a result
// will make that block not call the grid.sync(), causing a deadlock;
// a workaround might be to add a grid.sync() here, but it's not clear
// whether there are any edge cases;
// continue
}
// Perform ACC * (X^â - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
@@ -245,8 +227,6 @@ __global__ void device_bootstrap_low_latency(
params::degree / params::opt>(
accumulator_rotated, base_log, l_gadget);
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)

View File

@@ -337,7 +337,6 @@ void host_cmux_tree(
device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
memory_needed_per_block));
// TODO (Agnes): is this necessary?
checkCudaErrors(cudaFuncSetCacheConfig(
device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncCachePreferShared));

View File

@@ -83,9 +83,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
int gridSize = total_polynomials;
int blockSize = polynomial_size / choose_opt(polynomial_size);
// todo(Joao): let's use cudaMallocHost here,
// since it allocates page-locked memory which allows
// faster data copy
double2 *h_bsk = (double2 *)malloc(buffer_size);
double2 *d_bsk;
cudaMalloc((void **)&d_bsk, buffer_size);
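For reference, the alternative the removed note proposes: page-locked (pinned) host memory makes host-to-device copies faster and lets them run asynchronously. A sketch (stream obtained from v_stream as below):

double2 *h_bsk;
checkCudaErrors(cudaMallocHost((void **)&h_bsk, buffer_size)); // pinned
// ... build the FFT-ready key in h_bsk ...
cudaMemcpyAsync(d_bsk, h_bsk, buffer_size, cudaMemcpyHostToDevice, *stream);
// ... once the copy has completed:
cudaFreeHost(h_bsk);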
@@ -110,7 +108,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
auto stream = static_cast<cudaStream_t *>(v_stream);
switch (polynomial_size) {
// FIXME (Agnes): check if polynomial sizes are ok
case 512:
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);

View File

@@ -36,8 +36,6 @@ __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
template <typename T>
__device__ __forceinline__ T rescale_torus_element(T element,
uint32_t log_shift) {
// todo(Joao): not sure if this works
// return element >> log_shift;
// Note: despite its name, callers pass the modulus 2N itself (not its log)
// as log_shift (see the `2 * params::degree` call sites), so the plain
// right shift in the removed note above would not be equivalent.
return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
             (double)log_shift);
}
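Concretely, for 32-bit torus elements the function computes round(element * 2N / 2^32): e.g. element = 2^31 (one half of the torus) with 2N = 2048 maps to 1024. A host-side mirror for checking (illustrative):

#include <cmath>
#include <cstdint>

uint32_t rescale32(uint32_t element, uint32_t two_N) {
  return (uint32_t)llround((double)element / 4294967296.0 * (double)two_N);
}
// rescale32(1u << 31, 2048) == 1024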

View File

@@ -80,7 +80,7 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
* Each thread is always in charge of "opt/2" pairs of coefficients,
* which is why we always loop through N/2 by N/opt strides
* The pragma unroll instruction tells the compiler to unroll the
* full loop, which should increase performance TODO (Agnes) check this
* full loop, which should increase performance
*/
bit_reverse_inplace<params>(A);
__syncthreads();
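The thread layout the comment describes, as a schematic: each of the N/opt threads owns opt/2 of the N/2 butterfly pairs, visited at a stride of N/opt:

int tid = threadIdx.x;
#pragma unroll
for (int k = 0; k < params::opt / 2; k++) {
  // butterfly on the pair owned by (tid, k) ...
  tid += params::degree / params::opt; // stride of N/opt
}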
@@ -113,8 +113,6 @@ template <class params> __device__ void NSMFFT_direct(double2 *A) {
// between groups of 4 coefficients
// k=2, \zeta=exp(i pi/4) for even coefficients and
// exp(3 i pi / 4) for odd coefficients
// TODO (Agnes) how does this work on the gpu? aren't we doing
// a lot more computations than we should?
tid = threadIdx.x;
// odd = 0 for even coefficients, 1 for odd coefficients
int odd = tid & 1;
@@ -371,7 +369,7 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
* Each thread is always in charge of "opt/2" pairs of coefficients,
* which is why we always loop through N/2 by N/opt strides
* The pragma unroll instruction tells the compiler to unroll the
* full loop, which should increase performance TODO (Agnes) check this
* full loop, which should increase performance
*/
int tid;
int i1, i2;
@@ -589,8 +587,6 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
// between groups of 4 coefficients
// k=2, \zeta=exp(i pi/4) for even coefficients and
// exp(3 i pi / 4) for odd coefficients
// TODO (Agnes) how does this work on the gpu? aren't we doing
// a lot more computations than we should?
tid = threadIdx.x;
// odd = 0 for even coefficients, 1 for odd coefficients
int odd = tid & 1;
@@ -602,7 +598,6 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
i1 = (tid << 1) - odd;
i2 = i1 + 2;
// TODO(Beka) optimize twiddle multiplication
double2 w;
if (odd) {
w.x = -0.707106781186547461715008466854;
@@ -629,7 +624,6 @@ template <class params> __device__ void NSMFFT_inverse(double2 *A) {
// of coefficients, with a stride of 2
i1 = tid << 1;
i2 = i1 + 1;
// TODO(Beka) optimize twiddle multiplication
double2 w = {0, -1};
u = A[i1], v = A[i2];
A[i1] = (u + v) * 0.5;
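The hard-coded ±0.707106781186547... above is ±√2/2, the real and imaginary parts of exp(±iπ/4) and exp(±3iπ/4), and w = {0, -1} is exp(-iπ/2). A quick host-side check:

#include <cmath>
#include <cstdio>

int main() {
  printf("%.24f\n", cos(3.0 * M_PI / 4.0)); // -0.707106781186547...
  printf("%.24f\n", sin(-M_PI / 2.0));      // -1.0
}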

View File

@@ -2,14 +2,6 @@
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
#define GPU_BOOTSTRAP_TWIDDLES_CUH
// TODO (Agnes) depending on the device architecture
// can we make more of them __constant__?
// Do we have to define them all regardless of the
// polynomial degree and q values?
// TODO (Beka) make those two arrays dynamically sized,
// or find the exact maximum; for an 8192-length poly it should
// be less than 2048
extern __constant__ short SW1[2048];
extern __constant__ short SW2[2048];

View File

@@ -136,8 +136,6 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_ou
lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
}
// int lwe_size_before =
// (lwe_dimension_before + 1) * num_samples;
int lwe_size_after =
(lwe_dimension_after + 1) * num_samples;

View File

@@ -166,7 +166,6 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
Torus mx = (sizeof(Torus) == 4) ? UINT32_MAX : UINT64_MAX;
int tid = threadIdx.x;
#pragma unroll
// TODO (Beka) check if better memory access is possible
for (int i = 0; i < params::opt / 2; i++) {
double v1 = m_values[tid].x;
double v2 = m_values[tid].y;
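The loop then folds each double back onto the torus. One common normalize-and-round formulation, as a sketch (the exact scaling against mx used here is elided by the hunk):

// Keep the fractional part of a real torus value and scale it back to
// the 32-bit integer torus; per-coefficient step only.
__device__ uint32_t to_torus32(double v) {
  double frac = v - rint(v); // center into [-0.5, 0.5)
  return (uint32_t)(int64_t)llrint(frac * 4294967296.0); // scale by 2^32
}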
@@ -194,8 +193,6 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_out, Torus *accumulator) {
// Set first coefficient of the accumulator as the body of the LWE sample
// todo(Joao): not every thread needs to set it
// if (threadIdx.x == 0)
lwe_out[params::degree] = accumulator[0];
}

View File

@@ -50,8 +50,7 @@ public:
int chunk_size) {
int pos = chunk_num * chunk_size;
T *ptr = &m_data[pos];
// todo(Joao): unsafe, the user must pass a chunk whose size is a
// multiple of the polynomial degree
return VectorPolynomial<T, params>(ptr, chunk_size / params::degree);
}
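The guard the removed note asks for could be as small as (sketch):

// Reject chunk sizes that are not a whole number of polynomials.
assert(chunk_size % params::degree == 0);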
@@ -88,8 +87,6 @@ public:
synchronize_threads_in_block();
}
// todo(Joao): we need to make these APIs more clear, as it's confusing what's
// being copied where
__device__ void copy_into_ith_polynomial(PolynomialFourier<T, params> &source,
int i) {
int tid = threadIdx.x;
@@ -160,22 +157,6 @@ public:
}
}
/*
__device__ void add_polynomial_inplace(PolynomialFourier<T, params> &source,
int begin) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
this->m_values[tid] += source.m_values[tid + begin];
tid = tid + params::degree / params::opt;
}
if (threadIdx.x == 0) {
this->m_values[params::degree / 2] +=
    source.m_values[params::degree / 2 + begin];
}
}
*/
__device__ void swap_quarters_inplace() {
int tid = threadIdx.x;
int s1 = params::quarter;
@@ -202,20 +183,6 @@ begin];
}
}
__device__ void
forward_negacyclic_fft_inplace(PolynomialFourier<double2, params> &X) {
// TODO function should be removed
}
__device__ void inverse_negacyclic_fft_inplace() {
// TODO function should be removed
}
template <typename Torus>
__device__ void add_to_torus(Polynomial<Torus, params> &result) {
// TODO function should be removed
}
__device__ T &operator[](int i) { return m_values[i]; }
};
@@ -474,18 +441,6 @@ public:
}
}
/*
__device__ void add_polynomial_inplace(Polynomial<T, params> &source,
int begin) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
this->coefficients[tid] += source.coefficients[tid + begin];
tid = tid + params::degree / params::opt;
}
}
*/
__device__ void sub_polynomial_inplace(Polynomial<T, params> &rhs) {
int tid = threadIdx.x;
const int grid_dim = blockDim.x;
@@ -543,11 +498,6 @@ public:
synchronize_threads_in_block();
}
template <typename FT>
__device__ void
forward_negacyclic_fft_half(PolynomialFourier<FT, params> &result) {
// TODO function should be removed
}
};
template <typename T, class params> class Vector {
public:
@@ -624,7 +574,6 @@ public:
__device__ void set_last_element(T elem) { m_data[m_size - 1] = elem; }
// todo(Joao): let's do coalesced access here at some point
__device__ void operator-=(const Vector<T, params> &rhs) {
assert(m_size == rhs.m_size);
int tid = threadIdx.x;
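For reference, the coalesced pattern the removed note asks for: consecutive threads read consecutive elements, then stride by the block size (sketch):

__device__ void sub_inplace_coalesced(T *data, const T *rhs, int size) {
  for (int i = threadIdx.x; i < size; i += blockDim.x)
    data[i] -= rhs[i]; // thread i handles i, i + blockDim.x, ...
}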