diff --git a/src/bootstrap_amortized.cuh b/src/bootstrap_amortized.cuh
index 522c84caf..8c4e9eec6 100644
--- a/src/bootstrap_amortized.cuh
+++ b/src/bootstrap_amortized.cuh
@@ -12,7 +12,6 @@
 #include "../include/helper_cuda.h"
 #include "bootstrap.h"
 #include "complex/operations.cuh"
-//#include "crypto/bootstrapping_key.cuh"
 #include "crypto/gadget.cuh"
 #include "crypto/torus.cuh"
 #include "fft/bnsmfft.cuh"
@@ -83,7 +82,6 @@ __global__ void device_bootstrap_amortized(
   // polynomials take coefficients between -B/2 and B/2 they can be represented
   // with only 16 bits, assuming the base log does not exceed 2^16
   int16_t *accumulator_mask_decomposed = (int16_t *)selected_memory;
-  // TODO (Agnes) why not the 16 bits representation here?
   int16_t *accumulator_body_decomposed =
       (int16_t *)accumulator_mask_decomposed + polynomial_size;
   Torus *accumulator_mask = (Torus *)accumulator_body_decomposed +
@@ -103,28 +101,11 @@ __global__ void device_bootstrap_amortized(
   accumulator_fft =
       (double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);
 
-  /*
-  int dif0 = ((char*)accumulator_body_decomposed - (char*)selected_memory);
-  int dif1 = ((char*)accumulator_mask - (char*)accumulator_body_decomposed);
-  int dif2 = ((char*)accumulator_body - (char*)accumulator_mask);
-  int dif3 = ((char*)accumulator_mask_rotated - (char*)accumulator_body);
-  int dif4 = ((char*)accumulator_body_rotated -
-  (char*)accumulator_mask_rotated); int dif5 = ((char*)mask_res_fft -
-  (char*)accumulator_body_rotated); int dif6 = ((char*)body_res_fft -
-  (char*)mask_res_fft); int dif7 = (SMD != PARTIALSM)? (char*)accumulator_fft -
-  (char*)body_res_fft:0; if (threadIdx.x == 0 && blockIdx.x == 0) {
-  printf("device and shared mem: %d %d %d %d %d %d %d %d\n ",dif0, dif1, dif2,
-  dif3, dif4, dif5, dif6, dif7);
-  }
-  */
-
   auto block_lwe_in = &lwe_in[blockIdx.x * (lwe_mask_size + 1)];
   Torus *block_lut_vector = &lut_vector[lut_vector_indexes[lwe_idx +
                                         blockIdx.x] * params::degree * 2];
 
-  // TODO (Agnes) try to store the gadget matrix in const memory to see if
-  // register use decreases Since all const mem is used for twiddles currently,
-  // it would mean moving some of them to global memory instead
+
   GadgetMatrix gadget(base_log, l_gadget);
 
   // Put "b", the body, in [0, 2N[
@@ -145,7 +126,6 @@ __global__ void device_bootstrap_amortized(
   // into l_gadget polynomials, and performing polynomial multiplication
   // via an FFT with the RGSW encrypted secret key
   for (int iteration = 0; iteration < lwe_mask_size; iteration++) {
-    // TODO make sure that following sync is necessary
     synchronize_threads_in_block();
 
     // Put "a" in [0, 2N[ instead of Zq
@@ -153,18 +133,6 @@ __global__ void device_bootstrap_amortized(
         block_lwe_in[iteration], 2 * params::degree);
     // 2 * params::log2_degree + 1);
 
-    // TODO (Agnes) why is there this if condition?
-    if (a_hat == 0) {
-      // todo(Joao): **cannot use this optimization**
-      // the reason is that one of the input ciphertexts (blockIdx.z)
-      // might skip an iteration while others don't, which as a result
-      // will make that block not call the grid.sync(), causing a deadlock;
-      // maybe it's a workaround to add grid.sync() here, but not sure if
-      // there are any edge cases?
-
-      // continue
-    }
-
     // Perform ACC * (X^â - 1)
     multiply_by_monomial_negacyclic_and_sub_polynomial<
         Torus, params::opt, params::degree / params::opt>(
@@ -200,7 +168,9 @@ __global__ void device_bootstrap_amortized(
     // Now that the rotation is done, decompose the resulting polynomial
     // coefficients so as to multiply each decomposed level with the
     // corresponding part of the bootstrapping key
-    // TODO (Agnes) explain why we do that for the mask and body separately
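+    // The accumulator mask and body are decomposed separately: each
+    // decomposed polynomial is multiplied below with its own row of the
+    // bootstrapping key (the third argument of get_ith_mask_kth_block)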
 
     for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) {
       gadget.decompose_one_level(accumulator_mask_decomposed,
@@ -227,8 +197,6 @@ __global__ void device_bootstrap_amortized(
 
       // Get the bootstrapping key piece necessary for the multiplication
      // It is already in the Fourier domain
-      // TODO (Agnes) Explain why for the mask polynomial multiplication
-      // we need the bsk_body_slice and vice versa
       auto bsk_mask_slice = PolynomialFourier(
           get_ith_mask_kth_block(
               bootstrapping_key, iteration, 0, decomp_level,
@@ -241,7 +209,10 @@ __global__ void device_bootstrap_amortized(
       synchronize_threads_in_block();
 
       // Perform the coefficient-wise product with the two pieces of
-      // bootstrapping key TODO (Agnes) why two pieces?
+      // bootstrapping key
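+      // (each piece is itself a GLWE ciphertext: the mask piece accumulates
+      // into mask_res_fft and the body piece into body_res_fft, which
+      // together form the external product)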
       polynomial_product_accumulate_in_fourier_domain(
           mask_res_fft, accumulator_fft, bsk_mask_slice);
       polynomial_product_accumulate_in_fourier_domain(
           body_res_fft, accumulator_fft, bsk_body_slice);
@@ -333,7 +304,10 @@ __global__ void device_bootstrap_amortized(
   // The blind rotation for this block is over
   // Now we can perform the sample extraction: for the body it's just
   // the resulting constant coefficient of the accumulator
-  // For the mask it's more complicated TODO (Agnes) explain why
+  // For the mask it's more complicated
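+  // (roughly: the mask coefficients of the output LWE are taken from the
+  // accumulator mask in reverse order, with a sign flip where the
+  // negacyclic wrap-around applies)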
   sample_extract_mask(block_lwe_out, accumulator_mask);
   sample_extract_body(block_lwe_out, accumulator_body);
 }
@@ -380,11 +354,6 @@ __host__ void host_bootstrap_amortized(
   // handles opt polynomial coefficients
   // (actually opt/2 coefficients since we compress the real polynomial into a
   // complex)
-  // TODO (Agnes) Polynomial size / params::opt should be equal to 256 or 512
-  // probably, maybe 1024 would be too big?
-  // Or would it actually be good in our case to have the largest possible
-  // number of threads per block since anyway few blocks will run
-  // concurrently?
   dim3 grid(input_lwe_ciphertext_count, 1, 1);
   dim3 thds(polynomial_size / params::opt, 1, 1);
 
@@ -426,7 +395,6 @@ __host__ void host_bootstrap_amortized(
   checkCudaErrors(cudaFuncSetAttribute(
       device_bootstrap_amortized,
       cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
-  // TODO (Agnes): is this necessary?
   checkCudaErrors(cudaFuncSetCacheConfig(
       device_bootstrap_amortized,
       cudaFuncCachePreferShared));
@@ -454,7 +422,6 @@ int cuda_get_pbs_per_gpu(int polynomial_size) {
   int num_threads = polynomial_size / params::opt;
   cudaGetDeviceCount(0);
   cudaDeviceProp device_properties;
-  // FIXME: here we assume every device has same properties
   cudaGetDeviceProperties(&device_properties, 0);
   cudaOccupancyMaxActiveBlocksPerMultiprocessor(
       &blocks_per_sm, device_bootstrap_amortized,
diff --git a/src/bootstrap_low_latency.cu b/src/bootstrap_low_latency.cu
index 7073def4c..b89b754c4 100644
--- a/src/bootstrap_low_latency.cu
+++ b/src/bootstrap_low_latency.cu
@@ -48,12 +48,11 @@
  *  - switch to the FFT domain
  *  - multiply with the bootstrapping key
  *  - come back to the coefficients representation
- *  - between each stage a synchronization of the threads is necessary TODO
- * (Agnes) check this
+ *  - between each stage a synchronization of the threads is necessary
  *  - in case the device has enough shared memory, temporary arrays used for
  *    the different stages (accumulators) are stored into the shared memory
  *  - the accumulators serve to combine the results for all decomposition
- *    levels TODO (Agnes) check this
+ *    levels
  *  - the constant memory (64K) is used for storing the roots of identity
  *    values for the FFT
  */
diff --git a/src/bootstrap_low_latency.cuh b/src/bootstrap_low_latency.cuh
index 5ab608e89..8b845aed5 100644
--- a/src/bootstrap_low_latency.cuh
+++ b/src/bootstrap_low_latency.cuh
@@ -23,8 +23,7 @@
 #include "utils/memory.cuh"
 #include "utils/timer.cuh"
 
-// Cooperative groups are used in the low latency
-// version of the bootstrapping
+// Cooperative groups are used in the low latency PBS
 using namespace cooperative_groups;
 namespace cg = cooperative_groups;
 
@@ -58,11 +57,6 @@ mul_trgsw_trlwe(Torus *accumulator,
   // needed to perform the external product in this block (corresponding to
   // the same decomposition level)
 
-  //  auto bsk_mask_slice = bootstrapping_key.get_ith_mask_kth_block(
-  //      gpu_num, iteration, blockIdx.y, blockIdx.x);
-  //  auto bsk_body_slice = bootstrapping_key.get_ith_body_kth_block(
-  //      gpu_num, iteration, blockIdx.y, blockIdx.x);
-
   auto bsk_mask_slice = PolynomialFourier(
       get_ith_mask_kth_block(
           bootstrapping_key, iteration, blockIdx.y, blockIdx.x,
@@ -195,7 +189,6 @@ __global__ void device_bootstrap_low_latency(
   // Since the space in L1 cache is small, we use the same memory location for
   // the rotated accumulator and the fft accumulator, since we know that the
   // rotated array is not in use anymore by the time we perform the fft
-
   GadgetMatrix gadget(base_log, l_gadget);
 
   // Put "b" in [0, 2N[
@@ -222,17 +215,8 @@ __global__ void device_bootstrap_low_latency(
         block_lwe_in[i], 2 * params::degree);
     // 2 * params::log2_degree + 1);
 
-    if (a_hat == 0) {
-      // todo(Joao): **cannot use this optimization**
-      // the reason is that one of the input ciphertexts (blockIdx.z)
-      // might skip an iteration while others don't, which as a result
-      // will make that block not call the grid.sync(), causing a deadlock;
-      // maybe it's a workaround to add grid.sync() here, but not sure if
-      // there are any edge cases?
-
-      // continue
-    }
-
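+    // Note: an iteration with a_hat == 0 cannot be skipped: every block
+    // must still reach the grid-wide synchronization, or the kernel deadlocks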
     // Perform ACC * (X^â - 1)
     multiply_by_monomial_negacyclic_and_sub_polynomial<
         Torus, params::opt, params::degree / params::opt>(
@@ -245,8 +229,6 @@ __global__ void device_bootstrap_low_latency(
   round_to_closest_multiple_inplace<Torus, params::opt,
                                     params::degree / params::opt>(
       accumulator_rotated, base_log, l_gadget);
-
-    // Decompose the accumulator. Each block gets one level of the
     // decomposition, for the mask and the body (so block 0 will have the
     // accumulator decomposed at level 0, 1 at 1, etc.)
diff --git a/src/bootstrap_wop.cuh b/src/bootstrap_wop.cuh
index b0c9e44ee..11fbe31c2 100644
--- a/src/bootstrap_wop.cuh
+++ b/src/bootstrap_wop.cuh
@@ -337,7 +337,6 @@ void host_cmux_tree(
   checkCudaErrors(cudaFuncSetAttribute(
       device_batch_cmux,
       cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block));
-  // TODO (Agnes): is this necessary?
   checkCudaErrors(cudaFuncSetCacheConfig(
       device_batch_cmux,
       cudaFuncCachePreferShared));
diff --git a/src/crypto/bootstrapping_key.cuh b/src/crypto/bootstrapping_key.cuh
index 6741312de..528fa6796 100644
--- a/src/crypto/bootstrapping_key.cuh
+++ b/src/crypto/bootstrapping_key.cuh
@@ -83,9 +83,7 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
   int gridSize = total_polynomials;
   int blockSize = polynomial_size / choose_opt(polynomial_size);
 
-  // todo(Joao): let's use cudaMallocHost here,
-  // since it allocates page-staged memory which allows
-  // faster data copy
+
   double2 *h_bsk = (double2 *)malloc(buffer_size);
   double2 *d_bsk;
   cudaMalloc((void **)&d_bsk, buffer_size);
@@ -110,7 +108,6 @@ void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
   auto stream = static_cast(v_stream);
 
   switch (polynomial_size) {
-    // FIXME (Agnes): check if polynomial sizes are ok
   case 512:
     batch_NSMFFT, ForwardFFT>>
         <<>>(d_bsk, dest);
diff --git a/src/crypto/torus.cuh b/src/crypto/torus.cuh
index 9dca4cb73..039bb4b50 100644
--- a/src/crypto/torus.cuh
+++ b/src/crypto/torus.cuh
@@ -36,8 +36,8 @@ __device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
 template
 __device__ __forceinline__ T rescale_torus_element(T element,
                                                    uint32_t log_shift) {
-  // todo(Joao): not sure if this works
-  // return element >> log_shift;
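+  // Rescales a torus element into Z_{2N}: note that, despite its name,
+  // `log_shift` receives the target modulus itself (callers pass
+  // 2 * params::degree)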
   return round((double)element /
                (double(std::numeric_limits::max()) + 1.0) *
                (double)log_shift);
 }
diff --git a/src/fft/bnsmfft.cuh b/src/fft/bnsmfft.cuh
index f3117bda8..974b59acb 100644
--- a/src/fft/bnsmfft.cuh
+++ b/src/fft/bnsmfft.cuh
@@ -80,7 +80,7 @@ template __device__ void NSMFFT_direct(double2 *A) {
  * Each thread is always in charge of "opt/2" pairs of coefficients,
  * which is why we always loop through N/2 by N/opt strides
  * The pragma unroll instruction tells the compiler to unroll the
- * full loop, which should increase performance TODO (Agnes) check this
+ * full loop, which should increase performance
  */
  bit_reverse_inplace(A);
  __syncthreads();
@@ -113,8 +113,6 @@ template __device__ void NSMFFT_direct(double2 *A) {
   // between groups of 4 coefficients
   // k=2, \zeta=exp(i pi/4) for even coefficients and
   // exp(3 i pi / 4) for odd coefficients
-  // TODO (Agnes) how does this work on the gpu? aren't we doing
-  // a lot more computations than we should?
   tid = threadIdx.x;
   // odd = 0 for even coefficients, 1 for odd coefficients
   int odd = tid & 1;
@@ -371,7 +369,7 @@ template __device__ void NSMFFT_inverse(double2 *A) {
  * Each thread is always in charge of "opt/2" pairs of coefficients,
  * which is why we always loop through N/2 by N/opt strides
  * The pragma unroll instruction tells the compiler to unroll the
- * full loop, which should increase performance TODO (Agnes) check this
+ * full loop, which should increase performance
  */
  int tid;
  int i1, i2;
@@ -589,8 +587,6 @@ template __device__ void NSMFFT_inverse(double2 *A) {
   // between groups of 4 coefficients
   // k=2, \zeta=exp(i pi/4) for even coefficients and
   // exp(3 i pi / 4) for odd coefficients
-  // TODO (Agnes) how does this work on the gpu? aren't we doing
-  // a lot more computations than we should?
   tid = threadIdx.x;
   // odd = 0 for even coefficients, 1 for odd coefficients
   int odd = tid & 1;
@@ -602,7 +598,6 @@ template __device__ void NSMFFT_inverse(double2 *A) {
     i1 = (tid << 1) - odd;
     i2 = i1 + 2;
 
-    // TODO(Beka) optimize twiddle multiplication
     double2 w;
     if (odd) {
       w.x = -0.707106781186547461715008466854;
@@ -629,7 +624,6 @@ template __device__ void NSMFFT_inverse(double2 *A) {
     // of coefficients, with a stride of 2
     i1 = tid << 1;
     i2 = i1 + 1;
-    // TODO(Beka) optimize twiddle multiplication
     double2 w = {0, -1};
     u = A[i1], v = A[i2];
     A[i1] = (u + v) * 0.5;
diff --git a/src/fft/twiddles.cuh b/src/fft/twiddles.cuh
index c776f974f..7213d3647 100644
--- a/src/fft/twiddles.cuh
+++ b/src/fft/twiddles.cuh
@@ -2,14 +2,6 @@
 #ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
 #define GPU_BOOTSTRAP_TWIDDLES_CUH
 
-// TODO (Agnes) depending on the device architecture
-// can we make more of them __constant__?
-// Do we have to define them all regardless of the
-// polynomial degree and q values?
-
-// TODO (Beka) make those two arrays with dynamic size
-// or find exact maximum for 8192 length poly it shuld
-// be less than 2048
 extern __constant__ short SW1[2048];
 extern __constant__ short SW2[2048];
 
diff --git a/src/keyswitch.cuh b/src/keyswitch.cuh
index 877fb26c2..d3d45d60b 100644
--- a/src/keyswitch.cuh
+++ b/src/keyswitch.cuh
@@ -136,8 +136,6 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector(void *v_stream, Torus *lwe_ou
     lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
   }
 
-  //  int lwe_size_before =
-  //      (lwe_dimension_before + 1) * num_samples;
   int lwe_size_after = (lwe_dimension_after + 1) * num_samples;
 
diff --git a/src/polynomial/functions.cuh b/src/polynomial/functions.cuh
index 53988c011..572e38f82 100644
--- a/src/polynomial/functions.cuh
+++ b/src/polynomial/functions.cuh
@@ -166,7 +166,6 @@ __device__ void add_to_torus(double2 *m_values, Torus *result) {
   Torus mx = (sizeof(Torus) == 4) ? UINT32_MAX : UINT64_MAX;
   int tid = threadIdx.x;
 #pragma unroll
-  // TODO (Beka) check if better memory access is possible
   for (int i = 0; i < params::opt / 2; i++) {
     double v1 = m_values[tid].x;
     double v2 = m_values[tid].y;
@@ -194,8 +193,6 @@
 template
 __device__ void sample_extract_body(Torus *lwe_out, Torus *accumulator) {
   // Set first coefficient of the accumulator as the body of the LWE sample
-  // todo(Joao): not every thread needs to set it
-  // if (threadIdx.x == 0)
   lwe_out[params::degree] = accumulator[0];
 }
diff --git a/src/polynomial/polynomial.cuh b/src/polynomial/polynomial.cuh
index b1339c634..5606e093a 100644
--- a/src/polynomial/polynomial.cuh
+++ b/src/polynomial/polynomial.cuh
@@ -50,8 +50,7 @@ public:
                                  int chunk_size) {
     int pos = chunk_num * chunk_size;
     T *ptr = &m_data[pos];
-    // todo(Joao): unsafe, user must pass chunk that has size multiple of
-    // polynomial degree
+
     return VectorPolynomial(ptr, chunk_size / params::degree);
   }
 
@@ -88,8 +87,6 @@ public:
     synchronize_threads_in_block();
   }
 
-  // todo(Joao): we need to make these APIs more clear, as it's confusing what's
-  // being copied where
   __device__ void copy_into_ith_polynomial(PolynomialFourier &source,
                                            int i) {
     int tid = threadIdx.x;
@@ -160,22 +157,6 @@ public:
     }
   }
 
-  /*
-  __device__ void add_polynomial_inplace(PolynomialFourier &source,
-                                         int begin) {
-    int tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt / 2; i++) {
-      this->m_values[tid] += source.m_values[tid + begin];
-      tid = tid + params::degree / params::opt;
-    }
-    if (threadIdx.x == 0) {
-      this->m_values[params::degree / 2] += source.m_values[params::degree / 2 +
-begin];
-    }
-  }
-  */
-
   __device__ void swap_quarters_inplace() {
     int tid = threadIdx.x;
     int s1 = params::quarter;
@@ -202,20 +183,6 @@ public:
     }
   }
 
-  __device__ void
-  forward_negacyclic_fft_inplace(PolynomialFourier &X) {
-    // TODO function should be removed
-  }
-
-  __device__ void inverse_negacyclic_fft_inplace() {
-    // TODO function should be removed
-  }
-
-  template
-  __device__ void add_to_torus(Polynomial &result) {
-    // TODO function should be removed
-  }
-
   __device__ T &operator[](int i) { return m_values[i]; }
 };
 
@@ -474,18 +441,6 @@ public:
     }
   }
 
-  /*
-  __device__ void add_polynomial_inplace(Polynomial &source,
-                                         int begin) {
-    int tid = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < params::opt; i++) {
-      this->coefficients[tid] += source.coefficients[tid + begin];
-      tid = tid + params::degree / params::opt;
-    }
-  }
-  */
-
   __device__ void sub_polynomial_inplace(Polynomial &rhs) {
     int tid = threadIdx.x;
     const int grid_dim = blockDim.x;
@@ -543,11 +498,6 @@ public:
     synchronize_threads_in_block();
   }
 
-  template
-  __device__ void
-  forward_negacyclic_fft_half(PolynomialFourier &result) {
-    // TODO function should be removed
-  }
 };
 
 template class Vector {
@@ -624,7 +574,6 @@ public:
   __device__ void set_last_element(T elem) { m_data[m_size - 1] = elem; }
 
-  // todo(Joao): let's do coalesced access here at some point
   __device__ void operator-=(const Vector &rhs) {
     assert(m_size == rhs->m_size);
     int tid = threadIdx.x;