mirror of https://github.com/zama-ai/concrete.git (synced 2026-02-10 04:35:03 -05:00)
feat(cuda): introduce cuda acceleration for the pbs and keyswitch
- A new crate, concrete-cuda, is added to the repository, containing Cuda implementations of the bootstrap and keyswitch and a Rust wrapper to call them.
- A new backend_cuda is added to concrete-core, with dedicated entities whose memory is located on the GPU and engines that call the Cuda-accelerated functions.
src/CMakeLists.txt (Normal file, 10 lines)
@@ -0,0 +1,10 @@
set(SOURCES ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
    ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h)
# Note: this GLOB overwrites the SOURCES list of headers set just above
file(GLOB SOURCES
    "*.cu"
    "*.h"
    "fft/*.cu")
add_library(concrete_cuda STATIC ${SOURCES})
set_target_properties(concrete_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(concrete_cuda PUBLIC cudart)
target_include_directories(concrete_cuda PRIVATE .)
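This diff does not show include/bootstrap.h itself; judging from the definitions in src/bootstrap_amortized.cu and src/bootstrap_low_latency.cu below, it presumably declares the entry points with C linkage so the Rust wrapper can bind them, along these lines (a sketch, not the committed header):

extern "C" {
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
    void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
    void *lwe_in, void *bootstrapping_key, uint32_t input_lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory);
// the _64 variant, the low-latency variants and the key conversion
// functions presumably follow the same shape
}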
src/bootstrap.cu (Normal file, 1 line)
@@ -0,0 +1 @@
#include "crypto/bootstrapping_key.cuh"
src/bootstrap_amortized.cu (Normal file, 174 lines)
@@ -0,0 +1,174 @@
#include "bootstrap_amortized.cuh"

/* Perform bootstrapping on a batch of input LWE ciphertexts
 *
 * - lwe_out: output batch of num_samples bootstrapped ciphertexts c =
 *   (a0,..an-1,b) where n is the LWE dimension
 * - lut_vector: should hold as many test vectors of size polynomial_size
 *   as there are input ciphertexts, but actually holds
 *   num_lut_vectors vectors to reduce memory usage
 * - lut_vector_indexes: stores the index corresponding to
 *   which test vector to use for each sample in
 *   lut_vector
 * - lwe_in: input batch of num_samples LWE ciphertexts, containing n
 *   mask values + 1 body value
 * - bootstrapping_key: RGSW encryption of the LWE secret key sk1
 *   under secret key sk2
 *   bsk = Z + sk1 H
 *   where H is the gadget matrix and Z is a matrix (k+1).l
 *   containing GLWE encryptions of 0 under sk2.
 *   bsk is thus a tensor of size (k+1)^2.l.N.n
 *   where l is the number of decomposition levels and
 *   k is the GLWE dimension, N is the polynomial size for
 *   GLWE. The polynomial size for GLWE and the test vector
 *   are the same because they have to be in the same ring
 *   to be multiplied.
 *   Note: it is necessary to generate (k+1).k.l.N.n
 *   uniformly random coefficients for the zero encryptions
 * - input_lwe_dimension: size of the Torus vector used to encrypt the input
 *   LWE ciphertexts - referred to as n above (~ 600)
 * - polynomial_size: size of the test polynomial (test vector) and size of the
 *   GLWE polynomial (~1024)
 * - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
 * - l_gadget: number of decomposition levels in the gadget matrix (~4)
 * - num_samples: number of encrypted input messages
 * - num_lut_vectors: parameter to set the actual number of test vectors to be
 *   used
 * - q: number of bits in the integer representation (32 or 64)
 *
 * This function calls a wrapper to a device kernel that performs the
 * bootstrapping:
 * - the kernel is templatized based on integer discretization and
 *   polynomial degree
 * - num_samples blocks of threads are launched, where each thread is going
 *   to handle one or more polynomial coefficients at each stage:
 *   - perform the blind rotation
 *   - round the result
 *   - decompose into l_gadget levels, then for each level:
 *     - switch to the FFT domain
 *     - multiply with the bootstrapping key
 *     - come back to the coefficients representation
 * - between each stage a synchronization of the threads is necessary
 * - in case the device has enough shared memory, temporary arrays used for
 *   the different stages (accumulators) are stored into the shared memory
 * - the accumulators serve to combine the results for all decomposition
 *   levels
 * - the constant memory (64K) is used for storing the roots of identity
 *   values for the FFT
 */
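// For a sense of scale: with the orders of magnitude quoted above (n = 600,
// k = 1, l = 4, N = 1024), the key holds n.(k+1)^2.l = 9600 polynomials; in
// the compressed Fourier layout of N/2 double2 values per polynomial used by
// this backend, that is 9600 * 512 * 16 bytes, i.e. roughly 78.6 MB of GPU
// memory per bootstrapping key.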

void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
    void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
    void *lwe_in, void *bootstrapping_key, uint32_t input_lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory) {

  switch (polynomial_size) {
  case 512:
    host_bootstrap_amortized<uint32_t, Degree<512>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 1024:
    host_bootstrap_amortized<uint32_t, Degree<1024>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 2048:
    host_bootstrap_amortized<uint32_t, Degree<2048>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 4096:
    host_bootstrap_amortized<uint32_t, Degree<4096>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 8192:
    host_bootstrap_amortized<uint32_t, Degree<8192>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  default:
    break;
  }
}

void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
    void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
    void *lwe_in, void *bootstrapping_key, uint32_t input_lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory) {

  switch (polynomial_size) {
  case 512:
    host_bootstrap_amortized<uint64_t, Degree<512>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 1024:
    host_bootstrap_amortized<uint64_t, Degree<1024>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 2048:
    host_bootstrap_amortized<uint64_t, Degree<2048>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 4096:
    host_bootstrap_amortized<uint64_t, Degree<4096>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  case 8192:
    host_bootstrap_amortized<uint64_t, Degree<8192>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, input_lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors, lwe_idx,
        max_shared_memory);
    break;
  default:
    break;
  }
}
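A minimal host-side sketch of driving the 64-bit amortized entry point defined above. The staging through cudaMalloc/cudaMemcpyAsync, the helper name and the 48 KB shared-memory value are illustrative assumptions, not part of this commit; the buffer sizes follow the parameter documentation:

#include <cstdint>
#include <vector>
#include <cuda_runtime.h>

void run_amortized_pbs_sketch(const std::vector<uint64_t> &lwe_in_host,
                              uint32_t lwe_dimension, uint32_t polynomial_size,
                              uint32_t base_log, uint32_t l_gadget,
                              uint32_t num_samples,
                              double2 *d_bsk,         // Fourier-domain key, already on device
                              uint64_t *d_lut_vector, // device test vector(s)
                              uint32_t *d_lut_indexes) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  uint64_t *d_lwe_in, *d_lwe_out;
  // each input ciphertext holds lwe_dimension mask values + 1 body value
  cudaMalloc((void **)&d_lwe_in, lwe_in_host.size() * sizeof(uint64_t));
  // each output ciphertext holds polynomial_size + 1 values
  cudaMalloc((void **)&d_lwe_out,
             (size_t)num_samples * (polynomial_size + 1) * sizeof(uint64_t));
  cudaMemcpyAsync(d_lwe_in, lwe_in_host.data(),
                  lwe_in_host.size() * sizeof(uint64_t),
                  cudaMemcpyHostToDevice, stream);
  cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
      &stream, d_lwe_out, d_lut_vector, d_lut_indexes, d_lwe_in, d_bsk,
      lwe_dimension, polynomial_size, base_log, l_gadget, num_samples,
      /*num_lut_vectors=*/1, /*lwe_idx=*/0,
      /*max_shared_memory=*/49152); // assume the default 48 KB per block
  cudaStreamSynchronize(stream);
  cudaFree(d_lwe_in);
  cudaFree(d_lwe_out);
  cudaStreamDestroy(stream);
}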
src/bootstrap_amortized.cuh (Normal file, 466 lines)
@@ -0,0 +1,466 @@
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#include <helper_cuda.h>
#endif

#ifndef CNCRT_AMORTIZED_PBS_H
#define CNCRT_AMORTIZED_PBS_H

#include "cooperative_groups.h"

#include "../include/helper_cuda.h"
#include "bootstrap.h"
#include "complex/operations.cuh"
//#include "crypto/bootstrapping_key.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "fft/bnsmfft.cuh"
#include "fft/smfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"

template <typename Torus, class params, sharedMemDegree SMD>
/*
 * Kernel launched by host_bootstrap_amortized
 *
 * Uses shared memory to increase performance
 * - lwe_out: output batch of num_samples bootstrapped ciphertexts c =
 *   (a0,..an-1,b) where n is the LWE dimension
 * - lut_vector: should hold as many test vectors of size polynomial_size
 *   as there are input ciphertexts, but actually holds
 *   num_lut_vectors vectors to reduce memory usage
 * - lut_vector_indexes: stores the index corresponding to which test vector
 *   to use for each sample in lut_vector
 * - lwe_in: input batch of num_samples LWE ciphertexts, containing n mask
 *   values + 1 body value
 * - bootstrapping_key: RGSW encryption of the LWE secret key sk1 under secret
 *   key sk2
 * - device_mem: pointer to the device's global memory in case we use it (SMD
 *   == NOSM or PARTIALSM)
 * - lwe_mask_size: size of the Torus vector used to encrypt the input
 *   LWE ciphertexts - referred to as n above (~ 600)
 * - polynomial_size: size of the test polynomial (test vector) and size of the
 *   GLWE polynomial (~1024)
 * - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
 * - l_gadget: number of decomposition levels in the gadget matrix (~4)
 * - gpu_num: index of the current GPU (useful for multi-GPU computations)
 * - lwe_idx: equal to the number of samples per gpu x gpu_num
 * - device_memory_size_per_sample: amount of global memory to allocate if SMD
 *   is not FULLSM
 */
__global__ void device_bootstrap_amortized(
    Torus *lwe_out, Torus *lut_vector, uint32_t *lut_vector_indexes,
    Torus *lwe_in, double2 *bootstrapping_key, char *device_mem,
    uint32_t lwe_mask_size, uint32_t polynomial_size, uint32_t base_log,
    uint32_t l_gadget, uint32_t lwe_idx,
    size_t device_memory_size_per_sample) {
  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
  extern __shared__ char sharedmem[];
  char *selected_memory;

  if constexpr (SMD == FULLSM)
    selected_memory = sharedmem;
  else
    selected_memory = &device_mem[blockIdx.x * device_memory_size_per_sample];

  // For GPU bootstrapping the RLWE dimension is hard-set to 1: there is only
  // one mask polynomial and 1 body to handle. Also, since the decomposed
  // polynomials take coefficients between -B/2 and B/2 they can be represented
  // with only 16 bits, assuming the base B does not exceed 2^16
  int16_t *accumulator_mask_decomposed = (int16_t *)selected_memory;
  // TODO (Agnes) why not the 16 bits representation here?
  int16_t *accumulator_body_decomposed =
      (int16_t *)accumulator_mask_decomposed + polynomial_size;
  Torus *accumulator_mask = (Torus *)accumulator_body_decomposed +
                            polynomial_size / (sizeof(Torus) / sizeof(int16_t));
  Torus *accumulator_body =
      (Torus *)accumulator_mask + (ptrdiff_t)polynomial_size;
  Torus *accumulator_mask_rotated =
      (Torus *)accumulator_body + (ptrdiff_t)polynomial_size;
  Torus *accumulator_body_rotated =
      (Torus *)accumulator_mask_rotated + (ptrdiff_t)polynomial_size;
  double2 *mask_res_fft = (double2 *)accumulator_body_rotated +
                          polynomial_size / (sizeof(double2) / sizeof(Torus));
  double2 *body_res_fft =
      (double2 *)mask_res_fft + (ptrdiff_t)polynomial_size / 2;
  double2 *accumulator_fft = (double2 *)sharedmem;
  if constexpr (SMD != PARTIALSM)
    accumulator_fft =
        (double2 *)body_res_fft + (ptrdiff_t)(polynomial_size / 2);

  /*
  int dif0 = ((char*)accumulator_body_decomposed - (char*)selected_memory);
  int dif1 = ((char*)accumulator_mask - (char*)accumulator_body_decomposed);
  int dif2 = ((char*)accumulator_body - (char*)accumulator_mask);
  int dif3 = ((char*)accumulator_mask_rotated - (char*)accumulator_body);
  int dif4 = ((char*)accumulator_body_rotated - (char*)accumulator_mask_rotated);
  int dif5 = ((char*)mask_res_fft - (char*)accumulator_body_rotated);
  int dif6 = ((char*)body_res_fft - (char*)mask_res_fft);
  int dif7 = (SMD != PARTIALSM) ? (char*)accumulator_fft - (char*)body_res_fft : 0;
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    printf("device and shared mem: %d %d %d %d %d %d %d %d\n", dif0, dif1,
           dif2, dif3, dif4, dif5, dif6, dif7);
  }
  */

  auto block_lwe_in = &lwe_in[blockIdx.x * (lwe_mask_size + 1)];
  Torus *block_lut_vector =
      &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * 2];

  // TODO (Agnes) try to store the gadget matrix in const memory to see if
  // register use decreases. Since all const mem is used for twiddles currently,
  // it would mean moving some of them to global memory instead
  GadgetMatrix<Torus, params> gadget(base_log, l_gadget);

  // Put "b", the body, in [0, 2N[
  Torus b_hat = rescale_torus_element(
      block_lwe_in[lwe_mask_size],
      2 * params::degree); // 2 * params::log2_degree + 1);

  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                        params::degree / params::opt>(
      accumulator_mask, block_lut_vector, b_hat, false);

  divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                        params::degree / params::opt>(
      accumulator_body, &block_lut_vector[params::degree], b_hat, false);

  // Loop over all the mask elements of the sample to accumulate
  // (X^a_i - 1) multiplication, decomposition of the resulting polynomial
  // into l_gadget polynomials, and performing polynomial multiplication
  // via an FFT with the RGSW encrypted secret key
  for (int iteration = 0; iteration < lwe_mask_size; iteration++) {
    // TODO make sure that the following sync is necessary
    synchronize_threads_in_block();

    // Put "a" in [0, 2N[ instead of Zq
    Torus a_hat = rescale_torus_element(
        block_lwe_in[iteration],
        2 * params::degree); // 2 * params::log2_degree + 1);

    // TODO (Agnes) why is there this if condition?
    if (a_hat == 0) {
      // todo(Joao): **cannot use this optimization**
      // the reason is that one of the input ciphertexts (blockIdx.z)
      // might skip an iteration while others don't, which as a result
      // will make that block not call the grid.sync(), causing a deadlock;
      // maybe it's a workaround to add grid.sync() here, but not sure if
      // there are any edge cases?

      // continue
    }

    // Perform ACC * (X^a_hat - 1)
    multiply_by_monomial_negacyclic_and_sub_polynomial<
        Torus, params::opt, params::degree / params::opt>(
        accumulator_mask, accumulator_mask_rotated, a_hat);

    multiply_by_monomial_negacyclic_and_sub_polynomial<
        Torus, params::opt, params::degree / params::opt>(
        accumulator_body, accumulator_body_rotated, a_hat);

    synchronize_threads_in_block();

    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
        accumulator_mask_rotated, base_log, l_gadget);

    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
        accumulator_body_rotated, base_log, l_gadget);

    // Initialize the polynomial multiplication via FFT arrays
    // The polynomial multiplication happens at the block level
    // and each thread handles two or more coefficients
    int pos = threadIdx.x;
    for (int j = 0; j < params::opt / 2; j++) {
      mask_res_fft[pos].x = 0;
      mask_res_fft[pos].y = 0;
      body_res_fft[pos].x = 0;
      body_res_fft[pos].y = 0;
      pos += params::degree / params::opt;
    }

    // Now that the rotation is done, decompose the resulting polynomial
    // coefficients so as to multiply each decomposed level with the
    // corresponding part of the bootstrapping key
    // TODO (Agnes) explain why we do that for the mask and body separately
    for (int decomp_level = 0; decomp_level < l_gadget; decomp_level++) {

      gadget.decompose_one_level(accumulator_mask_decomposed,
                                 accumulator_mask_rotated, decomp_level);

      gadget.decompose_one_level(accumulator_body_decomposed,
                                 accumulator_body_rotated, decomp_level);

      synchronize_threads_in_block();

      // First, perform the polynomial multiplication for the mask

      // Reduce the size of the FFT to be performed by storing
      // the real-valued polynomial into a complex polynomial
      real_to_complex_compressed<int16_t, params>(accumulator_mask_decomposed,
                                                  accumulator_fft);

      synchronize_threads_in_block();
      // Switch to the FFT space
      NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
      synchronize_threads_in_block();

      correction_direct_fft_inplace<params>(accumulator_fft);

      // Get the bootstrapping key piece necessary for the multiplication
      // It is already in the Fourier domain
      // TODO (Agnes) Explain why for the mask polynomial multiplication
      // we need the bsk_body_slice and vice versa
      auto bsk_mask_slice = PolynomialFourier<double2, params>(
          get_ith_mask_kth_block(bootstrapping_key, iteration, 0, decomp_level,
                                 polynomial_size, 1, l_gadget));
      auto bsk_body_slice = PolynomialFourier<double2, params>(
          get_ith_body_kth_block(bootstrapping_key, iteration, 0, decomp_level,
                                 polynomial_size, 1, l_gadget));

      synchronize_threads_in_block();

      // Perform the coefficient-wise product with the two pieces of
      // bootstrapping key TODO (Agnes) why two pieces?
      polynomial_product_accumulate_in_fourier_domain(
          mask_res_fft, accumulator_fft, bsk_mask_slice);
      polynomial_product_accumulate_in_fourier_domain(
          body_res_fft, accumulator_fft, bsk_body_slice);

      synchronize_threads_in_block();

      // Now handle the polynomial multiplication for the body
      // in the same way
      real_to_complex_compressed<int16_t, params>(accumulator_body_decomposed,
                                                  accumulator_fft);
      synchronize_threads_in_block();

      NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
      synchronize_threads_in_block();

      correction_direct_fft_inplace<params>(accumulator_fft);

      auto bsk_mask_slice_2 = PolynomialFourier<double2, params>(
          get_ith_mask_kth_block(bootstrapping_key, iteration, 1, decomp_level,
                                 polynomial_size, 1, l_gadget));
      auto bsk_body_slice_2 = PolynomialFourier<double2, params>(
          get_ith_body_kth_block(bootstrapping_key, iteration, 1, decomp_level,
                                 polynomial_size, 1, l_gadget));

      synchronize_threads_in_block();

      polynomial_product_accumulate_in_fourier_domain(
          mask_res_fft, accumulator_fft, bsk_mask_slice_2);
      polynomial_product_accumulate_in_fourier_domain(
          body_res_fft, accumulator_fft, bsk_body_slice_2);
    }

    // Come back to the coefficient representation
    if constexpr (SMD == FULLSM || SMD == NOSM) {
      synchronize_threads_in_block();

      correction_inverse_fft_inplace<params>(mask_res_fft);
      correction_inverse_fft_inplace<params>(body_res_fft);
      synchronize_threads_in_block();

      NSMFFT_inverse<HalfDegree<params>>(mask_res_fft);
      NSMFFT_inverse<HalfDegree<params>>(body_res_fft);

      synchronize_threads_in_block();

      add_to_torus<Torus, params>(mask_res_fft, accumulator_mask);
      add_to_torus<Torus, params>(body_res_fft, accumulator_body);
      synchronize_threads_in_block();
    } else {
      int tid = threadIdx.x;
#pragma unroll
      for (int i = 0; i < params::opt / 2; i++) {
        accumulator_fft[tid] = mask_res_fft[tid];
        tid = tid + params::degree / params::opt;
      }
      synchronize_threads_in_block();

      correction_inverse_fft_inplace<params>(accumulator_fft);
      synchronize_threads_in_block();

      NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
      synchronize_threads_in_block();

      add_to_torus<Torus, params>(accumulator_fft, accumulator_mask);
      synchronize_threads_in_block();

      tid = threadIdx.x;
#pragma unroll
      for (int i = 0; i < params::opt / 2; i++) {
        accumulator_fft[tid] = body_res_fft[tid];
        tid = tid + params::degree / params::opt;
      }
      synchronize_threads_in_block();

      correction_inverse_fft_inplace<params>(accumulator_fft);
      synchronize_threads_in_block();

      NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
      synchronize_threads_in_block();

      add_to_torus<Torus, params>(accumulator_fft, accumulator_body);
      synchronize_threads_in_block();
    }
  }

  auto block_lwe_out = &lwe_out[blockIdx.x * (polynomial_size + 1)];

  // The blind rotation for this block is over
  // Now we can perform the sample extraction: for the body it's just
  // the resulting constant coefficient of the accumulator
  // For the mask it's more complicated TODO (Agnes) explain why
  sample_extract_mask<Torus, params>(block_lwe_out, accumulator_mask);
  sample_extract_body<Torus, params>(block_lwe_out, accumulator_body);
}
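// rescale_torus_element comes from crypto/torus.cuh, which is not part of this
// diff. A minimal sketch of the operation it presumably performs, assuming a
// 32-bit torus and round-to-nearest scaling into [0, 2N[ (i.e. keeping the
// log2(2N) most significant bits); the name and the exact rounding are
// assumptions, not the crate's actual implementation:
__device__ inline uint32_t rescale_torus_element_sketch(uint32_t t,
                                                        uint32_t two_n) {
  uint32_t log2_two_n = 31 - __clz((int)two_n); // two_n is a power of two
  uint32_t shift = 32 - log2_two_n;
  // add half an output step before shifting so the truncation rounds to
  // nearest, then reduce modulo 2N
  return ((t + (1u << (shift - 1))) >> shift) & (two_n - 1);
}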

template <typename Torus, class params>
__host__ void host_bootstrap_amortized(
    void *v_stream, Torus *lwe_out, Torus *lut_vector,
    uint32_t *lut_vector_indexes, Torus *lwe_in, double2 *bootstrapping_key,
    uint32_t input_lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
    uint32_t l_gadget, uint32_t input_lwe_ciphertext_count,
    uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory) {

  int SM_FULL = sizeof(Torus) * polynomial_size +   // accumulator mask
                sizeof(Torus) * polynomial_size +   // accumulator body
                sizeof(Torus) * polynomial_size +   // accumulator mask rotated
                sizeof(Torus) * polynomial_size +   // accumulator body rotated
                sizeof(int16_t) * polynomial_size + // accumulator_dec mask
                sizeof(int16_t) * polynomial_size + // accumulator_dec body
                sizeof(double2) * polynomial_size / 2 + // accumulator fft mask
                sizeof(double2) * polynomial_size / 2 + // accumulator fft body
                sizeof(double2) * polynomial_size / 2;  // calculation buffer fft

  int SM_PART = sizeof(double2) * polynomial_size / 2; // calculation buffer fft

  int DM_PART = SM_FULL - SM_PART;

  int DM_FULL = SM_FULL;

  auto stream = static_cast<cudaStream_t *>(v_stream);

  char *d_mem;

  // Create a 1-dimensional grid of threads
  // where each block handles 1 sample and each thread
  // handles opt polynomial coefficients
  // (actually opt/2 coefficients since we compress the real polynomial into a
  // complex one)
  // TODO (Agnes) polynomial_size / params::opt should be equal to 256 or 512
  // probably, maybe 1024 would be too big?
  // Or would it actually be good in our case to have the largest possible
  // number of threads per block since anyway few blocks will run
  // concurrently?
  dim3 grid(input_lwe_ciphertext_count, 1, 1);
  dim3 thds(polynomial_size / params::opt, 1, 1);

  // Launch the kernel using polynomial_size / opt threads
  // where each thread computes opt polynomial coefficients
  // Depending on the required amount of shared memory, choose
  // from one of three templates (no use, partial use or full use
  // of shared memory)
  if (max_shared_memory < SM_PART) {
    checkCudaErrors(
        cudaMalloc((void **)&d_mem, DM_FULL * input_lwe_ciphertext_count));
    device_bootstrap_amortized<Torus, params, NOSM>
        <<<grid, thds, 0, *stream>>>(
            lwe_out, lut_vector, lut_vector_indexes, lwe_in, bootstrapping_key,
            d_mem, input_lwe_dimension, polynomial_size, base_log, l_gadget,
            lwe_idx, DM_FULL);
  } else if (max_shared_memory < SM_FULL) {
    cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
                         cudaFuncAttributeMaxDynamicSharedMemorySize, SM_PART);
    cudaFuncSetCacheConfig(
        device_bootstrap_amortized<Torus, params, PARTIALSM>,
        cudaFuncCachePreferShared);
    checkCudaErrors(
        cudaMalloc((void **)&d_mem, DM_PART * input_lwe_ciphertext_count));
    device_bootstrap_amortized<Torus, params, PARTIALSM>
        <<<grid, thds, SM_PART, *stream>>>(
            lwe_out, lut_vector, lut_vector_indexes, lwe_in, bootstrapping_key,
            d_mem, input_lwe_dimension, polynomial_size, base_log, l_gadget,
            lwe_idx, DM_PART);
  } else {
    // For devices with compute capability 7.x a single thread block can
    // address the full capacity of shared memory. Shared memory on the
    // device then has to be allocated dynamically.
    // For lower compute capabilities, this call
    // just does nothing and the amount of shared memory used is 48 KB
    checkCudaErrors(cudaFuncSetAttribute(
        device_bootstrap_amortized<Torus, params, FULLSM>,
        cudaFuncAttributeMaxDynamicSharedMemorySize, SM_FULL));
    // TODO (Agnes): is this necessary?
    checkCudaErrors(cudaFuncSetCacheConfig(
        device_bootstrap_amortized<Torus, params, FULLSM>,
        cudaFuncCachePreferShared));
    checkCudaErrors(cudaMalloc((void **)&d_mem, 0));

    device_bootstrap_amortized<Torus, params, FULLSM>
        <<<grid, thds, SM_FULL, *stream>>>(
            lwe_out, lut_vector, lut_vector_indexes, lwe_in, bootstrapping_key,
            d_mem, input_lwe_dimension, polynomial_size, base_log, l_gadget,
            lwe_idx, 0);
  }
  // Synchronize the stream before copying the result to lwe_out at the right
  // place
  cudaStreamSynchronize(*stream);
  cudaFree(d_mem);
}

template <typename Torus, class params>
int cuda_get_pbs_per_gpu(int polynomial_size) {

  int blocks_per_sm = 0;
  int num_threads = polynomial_size / params::opt;
  cudaDeviceProp device_properties;
  // FIXME: here we assume every device has the same properties
  cudaGetDeviceProperties(&device_properties, 0);
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocks_per_sm, device_bootstrap_amortized<Torus, params, FULLSM>,
      num_threads, 0);

  return device_properties.multiProcessorCount * blocks_per_sm;
}

#endif // CNCRT_AMORTIZED_PBS_H
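To gauge the shared-memory pressure of the FULLSM path: with Torus = uint64_t and polynomial_size = 1024, SM_FULL comes to 4 * 8 * 1024 bytes for the four Torus accumulators, 2 * 2 * 1024 bytes for the two int16_t decomposition buffers, and 3 * 16 * 512 bytes for the three double2 FFT buffers, i.e. 32768 + 4096 + 24576 = 61440 bytes (60 KB). That exceeds the default 48 KB dynamic shared-memory limit, which is why the FULLSM branch has to raise cudaFuncAttributeMaxDynamicSharedMemorySize before launching.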
src/bootstrap_low_latency.cu (Normal file, 183 lines)
@@ -0,0 +1,183 @@
#include "bootstrap_low_latency.cuh"

/* Perform bootstrapping on a batch of input LWE ciphertexts
 *
 * - lwe_out: output batch of num_samples bootstrapped ciphertexts c =
 *   (a0,..an-1,b) where n is the LWE dimension
 * - lut_vector: should hold as many test vectors of size polynomial_size
 *   as there are input ciphertexts, but actually holds
 *   num_lut_vectors vectors to reduce memory usage
 * - lut_vector_indexes: stores the index corresponding to
 *   which test vector to use for each sample in
 *   lut_vector
 * - lwe_in: input batch of num_samples LWE ciphertexts, containing n
 *   mask values + 1 body value
 * - bootstrapping_key: RGSW encryption of the LWE secret key sk1
 *   under secret key sk2
 *   bsk = Z + sk1 H
 *   where H is the gadget matrix and Z is a matrix (k+1).l
 *   containing GLWE encryptions of 0 under sk2.
 *   bsk is thus a tensor of size (k+1)^2.l.N.n
 *   where l is the number of decomposition levels and
 *   k is the GLWE dimension, N is the polynomial size for
 *   GLWE. The polynomial size for GLWE and the test vector
 *   are the same because they have to be in the same ring
 *   to be multiplied.
 *   Note: it is necessary to generate (k+1).k.l.N.n
 *   uniformly random coefficients for the zero encryptions
 * - lwe_dimension: size of the Torus vector used to encrypt the input
 *   LWE ciphertexts - referred to as n above (~ 600)
 * - polynomial_size: size of the test polynomial (test vector) and size of the
 *   GLWE polynomial (~1024)
 * - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
 * - l_gadget: number of decomposition levels in the gadget matrix (~4)
 * - num_samples: number of encrypted input messages
 * - num_lut_vectors: parameter to set the actual number of test vectors to be
 *   used
 * - q: number of bits in the integer representation (32 or 64)
 *
 * This function calls a wrapper to a device kernel that performs the
 * bootstrapping:
 * - the kernel is templatized based on integer discretization and
 *   polynomial degree
 * - num_samples blocks of threads are launched, where each thread is going
 *   to handle one or more polynomial coefficients at each stage:
 *   - perform the blind rotation
 *   - round the result
 *   - decompose into l_gadget levels, then for each level:
 *     - switch to the FFT domain
 *     - multiply with the bootstrapping key
 *     - come back to the coefficients representation
 * - between each stage a synchronization of the threads is necessary TODO
 *   (Agnes) check this
 * - in case the device has enough shared memory, temporary arrays used for
 *   the different stages (accumulators) are stored into the shared memory
 * - the accumulators serve to combine the results for all decomposition
 *   levels TODO (Agnes) check this
 * - the constant memory (64K) is used for storing the roots of identity
 *   values for the FFT
 */
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
    void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
    void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory) {

  switch (polynomial_size) {
  case 512:
    host_bootstrap_low_latency<uint32_t, Degree<512>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 1024:
    host_bootstrap_low_latency<uint32_t, Degree<1024>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 2048:
    host_bootstrap_low_latency<uint32_t, Degree<2048>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 4096:
    host_bootstrap_low_latency<uint32_t, Degree<4096>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 8192:
    host_bootstrap_low_latency<uint32_t, Degree<8192>>(
        v_stream, (uint32_t *)lwe_out, (uint32_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  default:
    break;
  }
}

void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
    void *v_stream, void *lwe_out, void *lut_vector, void *lut_vector_indexes,
    void *lwe_in, void *bootstrapping_key, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget,
    uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
    uint32_t max_shared_memory) {

  switch (polynomial_size) {
  case 512:
    host_bootstrap_low_latency<uint64_t, Degree<512>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 1024:
    host_bootstrap_low_latency<uint64_t, Degree<1024>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 2048:
    host_bootstrap_low_latency<uint64_t, Degree<2048>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 4096:
    host_bootstrap_low_latency<uint64_t, Degree<4096>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  case 8192:
    host_bootstrap_low_latency<uint64_t, Degree<8192>>(
        v_stream, (uint64_t *)lwe_out, (uint64_t *)lut_vector,
        (uint32_t *)lut_vector_indexes, (uint64_t *)lwe_in,
        (double2 *)bootstrapping_key, lwe_dimension, polynomial_size,
        base_log, l_gadget, num_samples, num_lut_vectors);
    break;
  default:
    break;
  }
}
src/bootstrap_low_latency.cuh (Normal file, 348 lines)
@@ -0,0 +1,348 @@
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#include <helper_cuda.h>
#endif

#ifndef LOWLAT_PBS_H
#define LOWLAT_PBS_H

#include "cooperative_groups.h"

#include "../include/helper_cuda.h"
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "fft/bnsmfft.cuh"
#include "fft/smfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"

// Cooperative groups are used in the low latency
// version of the bootstrapping
using namespace cooperative_groups;
namespace cg = cooperative_groups;

template <typename Torus, class params>
__device__ void
mul_trgsw_trlwe(Torus *accumulator, double2 *fft, int16_t *trlwe_decomposed,
                double2 *mask_join_buffer, double2 *body_join_buffer,
                double2 *bootstrapping_key, int polynomial_size, int l_gadget,
                int iteration, grid_group &grid) {

  // Put the decomposed TRLWE sample in the Fourier domain
  real_to_complex_compressed<int16_t, params>(trlwe_decomposed, fft);
  synchronize_threads_in_block();

  // Switch to the FFT space
  NSMFFT_direct<HalfDegree<params>>(fft);
  synchronize_threads_in_block();

  correction_direct_fft_inplace<params>(fft);
  synchronize_threads_in_block();

  // Get the pieces of the bootstrapping key that will be needed for the
  // external product; blockIdx.x is the ID of the block that's executing
  // this function, so we end up getting the lines of the bootstrapping key
  // needed to perform the external product in this block (corresponding to
  // the same decomposition level)

  // auto bsk_mask_slice = bootstrapping_key.get_ith_mask_kth_block(
  //     gpu_num, iteration, blockIdx.y, blockIdx.x);
  // auto bsk_body_slice = bootstrapping_key.get_ith_body_kth_block(
  //     gpu_num, iteration, blockIdx.y, blockIdx.x);

  auto bsk_mask_slice = PolynomialFourier<double2, params>(
      get_ith_mask_kth_block(bootstrapping_key, iteration, blockIdx.y,
                             blockIdx.x, polynomial_size, 1, l_gadget));
  auto bsk_body_slice = PolynomialFourier<double2, params>(
      get_ith_body_kth_block(bootstrapping_key, iteration, blockIdx.y,
                             blockIdx.x, polynomial_size, 1, l_gadget));

  // Perform the matrix multiplication between the RGSW and the TRLWE,
  // each block operating on a single level for mask and body

  auto first_processed_bsk =
      (blockIdx.y == 0) ? bsk_mask_slice : bsk_body_slice;
  auto second_processed_bsk =
      (blockIdx.y == 0) ? bsk_body_slice : bsk_mask_slice;

  auto first_processed_acc =
      (blockIdx.y == 0) ? &mask_join_buffer[params::degree / 2 * blockIdx.x]
                        : &body_join_buffer[params::degree / 2 * blockIdx.x];
  auto second_processed_acc =
      (blockIdx.y == 0) ? &body_join_buffer[params::degree / 2 * blockIdx.x]
                        : &mask_join_buffer[params::degree / 2 * blockIdx.x];

  int tid = threadIdx.x;

  // first product
  for (int i = 0; i < params::opt / 2; i++) {
    first_processed_acc[tid] = fft[tid] * first_processed_bsk.m_values[tid];
    tid += params::degree / params::opt;
  }

  grid.sync();
  tid = threadIdx.x;
  // second product
  for (int i = 0; i < params::opt / 2; i++) {
    second_processed_acc[tid] += fft[tid] * second_processed_bsk.m_values[tid];
    tid += params::degree / params::opt;
  }

  // -----------------------------------------------------------------

  // All blocks are synchronized here; after this sync, *_join_buffer has the
  // values needed from every other block
  grid.sync();

  auto src_acc = (blockIdx.y == 0) ? mask_join_buffer : body_join_buffer;

  // copy the first product into the fft buffer
  tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    fft[tid] = src_acc[tid];
    tid += params::degree / params::opt;
  }
  synchronize_threads_in_block();

  // accumulate the rest of the products into the fft buffer
  for (int l = 1; l < gridDim.x; l++) {
    auto cur_src_acc = &src_acc[l * params::degree / 2];
    tid = threadIdx.x;
    for (int i = 0; i < params::opt / 2; i++) {
      fft[tid] += cur_src_acc[tid];
      tid += params::degree / params::opt;
    }
  }

  synchronize_threads_in_block();

  correction_inverse_fft_inplace<params>(fft);
  synchronize_threads_in_block();

  // Perform the inverse FFT on the result of the RGSW x TRLWE product and add
  // it to the accumulator
  NSMFFT_inverse<HalfDegree<params>>(fft);
  synchronize_threads_in_block();

  add_to_torus<Torus, params>(fft, accumulator);

  __syncthreads();
}

template <typename Torus, class params>
/*
 * Kernel launched by the low latency version of the
 * bootstrapping, that uses cooperative groups
 * - lwe_out: vector of output LWE ciphertexts, with length
 *   (polynomial_size + 1) * num_samples
 * - lut_vector: vector of look-up tables with length
 *   polynomial_size * num_samples
 * - lut_vector_indexes: mapping between lwe_in and lut_vector
 * - lwe_in: vector of LWE inputs with length (lwe_mask_size + 1) * num_samples
 */
__global__ void device_bootstrap_low_latency(
    Torus *lwe_out, Torus *lut_vector, Torus *lwe_in,
    double2 *bootstrapping_key, double2 *mask_join_buffer,
    double2 *body_join_buffer, uint32_t lwe_mask_size,
    uint32_t polynomial_size, uint32_t base_log, uint32_t l_gadget) {

  grid_group grid = this_grid();

  // We use shared memory for the polynomials that are used often during the
  // bootstrap, since shared memory is kept in L1 cache and accessing it is
  // much faster than global memory
  extern __shared__ char sharedmem[];

  char *selected_memory = sharedmem;

  int16_t *accumulator_decomposed = (int16_t *)selected_memory;
  Torus *accumulator = (Torus *)accumulator_decomposed +
                       polynomial_size / (sizeof(Torus) / sizeof(int16_t));
  double2 *accumulator_fft =
      (double2 *)accumulator +
      polynomial_size / (sizeof(double2) / sizeof(Torus));

  // Reuse memory from accumulator_fft for accumulator_rotated
  Torus *accumulator_rotated = (Torus *)accumulator_fft;

  // The third dimension of the block is used to determine on which ciphertext
  // this block is operating, in the case of batch bootstraps
  auto block_lwe_in = &lwe_in[blockIdx.z * (lwe_mask_size + 1)];

  auto block_lut_vector = &lut_vector[blockIdx.z * params::degree * 2];

  auto block_mask_join_buffer =
      &mask_join_buffer[blockIdx.z * l_gadget * params::degree / 2];
  auto block_body_join_buffer =
      &body_join_buffer[blockIdx.z * l_gadget * params::degree / 2];

  // Since the space in L1 cache is small, we use the same memory location for
  // the rotated accumulator and the fft accumulator, since we know that the
  // rotated array is not in use anymore by the time we perform the fft

  GadgetMatrix<Torus, params> gadget(base_log, l_gadget);

  // Put "b" in [0, 2N[
  Torus b_hat =
      rescale_torus_element(block_lwe_in[lwe_mask_size], 2 * params::degree);

  if (blockIdx.y == 0) {
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, block_lut_vector, b_hat, false);
  } else {
    divide_by_monomial_negacyclic_inplace<Torus, params::opt,
                                          params::degree / params::opt>(
        accumulator, &block_lut_vector[params::degree], b_hat, false);
  }

  for (int i = 0; i < lwe_mask_size; i++) {
    synchronize_threads_in_block();

    // Put "a" in [0, 2N[
    Torus a_hat = rescale_torus_element(
        block_lwe_in[i],
        2 * params::degree); // 2 * params::log2_degree + 1);

    if (a_hat == 0) {
      // todo(Joao): **cannot use this optimization**
      // the reason is that one of the input ciphertexts (blockIdx.z)
      // might skip an iteration while others don't, which as a result
      // will make that block not call the grid.sync(), causing a deadlock;
      // maybe it's a workaround to add grid.sync() here, but not sure if
      // there are any edge cases?

      // continue
    }

    // Perform ACC * (X^a_hat - 1)
    multiply_by_monomial_negacyclic_and_sub_polynomial<
        Torus, params::opt, params::degree / params::opt>(
        accumulator, accumulator_rotated, a_hat);

    // Perform a rounding to increase the accuracy of the
    // bootstrapped ciphertext
    round_to_closest_multiple_inplace<Torus, params::opt,
                                      params::degree / params::opt>(
        accumulator_rotated, base_log, l_gadget);

    // Decompose the accumulator. Each block gets one level of the
    // decomposition, for the mask and the body (so block 0 will have the
    // accumulator decomposed at level 0, 1 at 1, etc.)
    gadget.decompose_one_level(accumulator_decomposed, accumulator_rotated,
                               blockIdx.x);

    // We are using the same memory space for accumulator_fft and
    // accumulator_rotated, so we need to synchronize here to make sure they
    // don't modify the same memory space at the same time
    synchronize_threads_in_block();

    // Perform G^-1(ACC) * RGSW -> TRLWE
    mul_trgsw_trlwe<Torus, params>(
        accumulator, accumulator_fft, accumulator_decomposed,
        block_mask_join_buffer, block_body_join_buffer, bootstrapping_key,
        polynomial_size, l_gadget, i, grid);
  }

  auto block_lwe_out = &lwe_out[blockIdx.z * (polynomial_size + 1)];

  if (blockIdx.x == 0 && blockIdx.y == 0) {
    // Perform a sample extract. At this point, all blocks have the result, but
    // we do the computation at block 0 to avoid waiting for extra blocks, in
    // case they're not synchronized
    sample_extract_mask<Torus, params>(block_lwe_out, accumulator);
  } else if (blockIdx.x == 0 && blockIdx.y == 1) {
    sample_extract_body<Torus, params>(block_lwe_out, accumulator);
  }
}
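// A concrete reading of the launch geometry used by the host wrapper below:
// with l_gadget = 4 and num_samples = 100 the grid is dim3(4, 2, 100), i.e.
// 800 blocks, where block (x, y, z) processes decomposition level x of the
// mask (y == 0) or body (y == 1) accumulator of sample z. Because the kernel
// relies on grid.sync(), all of these blocks must be resident on the device
// at once, which is what bounds the batch size of the low-latency variant.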

/*
 * Host wrapper to the low latency version
 * of bootstrapping
 */
template <typename Torus, class params>
__host__ void host_bootstrap_low_latency(
    void *v_stream, Torus *lwe_out, Torus *lut_vector,
    uint32_t *lut_vector_indexes, Torus *lwe_in, double2 *bootstrapping_key,
    uint32_t lwe_mask_size, uint32_t polynomial_size, uint32_t base_log,
    uint32_t l_gadget, uint32_t num_samples, uint32_t num_lut_vectors) {
  auto stream = static_cast<cudaStream_t *>(v_stream);

  int buffer_size_per_gpu =
      l_gadget * num_samples * polynomial_size / 2 * sizeof(double2);
  double2 *mask_buffer_fft;
  double2 *body_buffer_fft;
  checkCudaErrors(cudaMalloc((void **)&mask_buffer_fft, buffer_size_per_gpu));
  checkCudaErrors(cudaMalloc((void **)&body_buffer_fft, buffer_size_per_gpu));

  int bytes_needed =
      sizeof(int16_t) * polynomial_size +    // accumulator_decomposed
      sizeof(Torus) * polynomial_size +      // accumulator
      sizeof(double2) * polynomial_size / 2; // accumulator fft

  int thds = polynomial_size / params::opt;
  dim3 grid(l_gadget, 2, num_samples);

  void *kernel_args[10];
  kernel_args[0] = &lwe_out;
  kernel_args[1] = &lut_vector;
  kernel_args[2] = &lwe_in;
  kernel_args[3] = &bootstrapping_key;
  kernel_args[4] = &mask_buffer_fft;
  kernel_args[5] = &body_buffer_fft;
  kernel_args[6] = &lwe_mask_size;
  kernel_args[7] = &polynomial_size;
  kernel_args[8] = &base_log;
  kernel_args[9] = &l_gadget;

  checkCudaErrors(cudaFuncSetAttribute(
      device_bootstrap_low_latency<Torus, params>,
      cudaFuncAttributeMaxDynamicSharedMemorySize, bytes_needed));
  cudaFuncSetCacheConfig(device_bootstrap_low_latency<Torus, params>,
                         cudaFuncCachePreferShared);

  checkCudaErrors(cudaLaunchCooperativeKernel(
      (void *)device_bootstrap_low_latency<Torus, params>, grid, thds,
      (void **)kernel_args, bytes_needed, *stream));

  // Synchronize the stream before copying the result to lwe_out at the right
  // place
  cudaStreamSynchronize(*stream);
  cudaFree(mask_buffer_fft);
  cudaFree(body_buffer_fft);
}

#endif // LOWLAT_PBS_H
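cudaLaunchCooperativeKernel, used by the host wrapper above, only succeeds on devices that support cooperative launch, so a defensive caller would probably check the device attribute first. A minimal sketch (not part of this commit):

int supports_coop = 0;
cudaDeviceGetAttribute(&supports_coop, cudaDevAttrCooperativeLaunch,
                       /*device=*/0);
if (!supports_coop) {
  // fall back to the amortized bootstrap, which only needs block-level syncs
}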
src/complex/operations.cuh (Normal file, 87 lines)
@@ -0,0 +1,87 @@
#ifndef GPU_BOOTSTRAP_COMMON_CUH
#define GPU_BOOTSTRAP_COMMON_CUH

#include <cassert>
#include <cstdint>
#include <cstdio>

#define SNT 1
#define dPI 6.283185307179586231995926937088

using sTorus = int32_t;
// using Torus = uint32_t;
using u32 = uint32_t;
using i32 = int32_t;

//--------------------------------------------------
// Basic double2 operations

__device__ inline double2 conjugate(const double2 num) {
  double2 res;
  res.x = num.x;
  res.y = -num.y;
  return res;
}

__device__ inline void operator+=(double2 &lh, const double2 rh) {
  lh.x += rh.x;
  lh.y += rh.y;
}

__device__ inline void operator-=(double2 &lh, const double2 rh) {
  lh.x -= rh.x;
  lh.y -= rh.y;
}

__device__ inline double2 operator+(const double2 a, const double2 b) {
  double2 res;
  res.x = a.x + b.x;
  res.y = a.y + b.y;
  return res;
}

__device__ inline double2 operator-(const double2 a, const double2 b) {
  double2 res;
  res.x = a.x - b.x;
  res.y = a.y - b.y;
  return res;
}

__device__ inline double2 operator*(const double2 a, const double2 b) {
  double xx = a.x * b.x;
  double xy = a.x * b.y;
  double yx = a.y * b.x;
  double yy = a.y * b.y;

  double2 res;
  // asm volatile("fma.rn.f64 %0, %1, %2, %3;": "=d"(res.x) : "d"(a.x),
  // "d"(b.x), "d"(yy));
  res.x = xx - yy;
  res.y = xy + yx;
  return res;
}

__device__ inline double2 operator*(const double2 a, double b) {
  double2 res;
  res.x = a.x * b;
  res.y = a.y * b;
  return res;
}

__device__ inline void operator*=(double2 &a, const double2 b) {
  double tmp = a.x;
  a.x *= b.x;
  a.x -= a.y * b.y;
  a.y *= b.x;
  a.y += b.y * tmp;
}

__device__ inline double2 operator*(double a, double2 b) {
  double2 res;
  res.x = b.x * a;
  res.y = b.y * a;
  return res;
}

#endif
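The commented-out asm in the complex operator* above hints at a fused multiply-add variant. With the documented CUDA intrinsics it would look roughly like this sketch (an alternative, not what the commit ships):

__device__ inline double2 complex_mul_fma(const double2 a, const double2 b) {
  double2 res;
  res.x = __fma_rn(a.x, b.x, -(a.y * b.y)); // a.x*b.x - a.y*b.y
  res.y = __fma_rn(a.x, b.y, a.y * b.x);    // a.x*b.y + a.y*b.x
  return res;
}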
src/crypto/bootstrapping_key.cuh (Normal file, 157 lines)
@@ -0,0 +1,157 @@
#ifndef CNCRT_BSK_H
#define CNCRT_BSK_H

#include "bootstrap.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include <atomic>
#include <cstdint>

__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
                                         int glwe_dimension,
                                         uint32_t l_gadget) {
  return i * polynomial_size / 2 * (glwe_dimension + 1) *
         (glwe_dimension + 1) * l_gadget;
}

__device__ double2 *
get_ith_mask_kth_block(double2 *ptr, int i, int k, int level,
                       uint32_t polynomial_size, int glwe_dimension,
                       uint32_t l_gadget) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
                                 l_gadget) +
              level * polynomial_size / 2 * (glwe_dimension + 1) *
                  (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1)];
}

__device__ double2 *
get_ith_body_kth_block(double2 *ptr, int i, int k, int level,
                       uint32_t polynomial_size, int glwe_dimension,
                       uint32_t l_gadget) {
  return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
                                 l_gadget) +
              level * polynomial_size / 2 * (glwe_dimension + 1) *
                  (glwe_dimension + 1) +
              k * polynomial_size / 2 * (glwe_dimension + 1) +
              polynomial_size / 2];
}
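// A host-side mirror of the indexing above can help reason about the bsk
// layout; a sketch under the same assumptions (glwe_dimension = 1,
// polynomials stored compressed as polynomial_size / 2 double2 values).
// For example, i = 2, k = 1, level = 3, N = 1024, l_gadget = 4 gives
// 2*8192 + 3*2048 + 1*1024 = 23552 double2 values into the key.
inline size_t bsk_mask_offset_sketch(int i, int k, int level,
                                     uint32_t polynomial_size,
                                     int glwe_dimension, uint32_t l_gadget) {
  size_t ggsw_size = (size_t)polynomial_size / 2 * (glwe_dimension + 1) *
                     (glwe_dimension + 1) * l_gadget;
  return (size_t)i * ggsw_size +
         (size_t)level * polynomial_size / 2 * (glwe_dimension + 1) *
             (glwe_dimension + 1) +
         (size_t)k * polynomial_size / 2 * (glwe_dimension + 1);
}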
|
||||
|
||||
void cuda_initialize_twiddles(uint32_t polynomial_size, uint32_t gpu_index) {
|
||||
cudaSetDevice(gpu_index);
|
||||
int sw_size = polynomial_size / 2;
|
||||
short *sw1_h, *sw2_h;
|
||||
|
||||
sw1_h = (short *)malloc(sizeof(short) * sw_size);
|
||||
sw2_h = (short *)malloc(sizeof(short) * sw_size);
|
||||
|
||||
memset(sw1_h, 0, sw_size * sizeof(short));
|
||||
memset(sw2_h, 0, sw_size * sizeof(short));
|
||||
int cnt = 0;
|
||||
for (int i = 1, j = 0; i < polynomial_size / 2; i++) {
|
||||
int bit = (polynomial_size / 2) >> 1;
|
||||
for (; j & bit; bit >>= 1)
|
||||
j ^= bit;
|
||||
j ^= bit;
|
||||
|
||||
if (i < j) {
|
||||
sw1_h[cnt] = i;
|
||||
sw2_h[cnt] = j;
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
cudaMemcpyToSymbol(SW1, sw1_h, sw_size * sizeof(short), 0,
|
||||
cudaMemcpyHostToDevice);
|
||||
cudaMemcpyToSymbol(SW2, sw2_h, sw_size * sizeof(short), 0,
|
||||
cudaMemcpyHostToDevice);
|
||||
free(sw1_h);
|
||||
free(sw2_h);
|
||||
}
|
||||
|
||||
template <typename T, typename ST>
|
||||
void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
|
||||
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
|
||||
uint32_t l_gadget, uint32_t polynomial_size) {
|
||||
|
||||
cudaSetDevice(gpu_index);
|
||||
int shared_memory_size = sizeof(double) * polynomial_size;
|
||||
|
||||
int total_polynomials =
|
||||
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
|
||||
l_gadget;
|
||||
|
||||
// Here the buffer size is the size of double2 times the number of polynomials
|
||||
// times the polynomial size over 2 because the polynomials are compressed
|
||||
// into the complex domain to perform the FFT
|
||||
size_t buffer_size = total_polynomials * polynomial_size / 2 * sizeof
|
||||
(double2);
|
||||
|
||||
int gridSize = total_polynomials;
|
||||
int blockSize = polynomial_size / choose_opt(polynomial_size);
|
||||
// todo(Joao): let's use cudaMallocHost here,
|
||||
// since it allocates page-staged memory which allows
|
||||
// faster data copy
|
||||
double2 *h_bsk = (double2 *)malloc(buffer_size);
|
||||
double2 *d_bsk;
|
||||
cudaMalloc((void **)&d_bsk, buffer_size);
|
||||
|
||||
// compress real bsk to complex and divide it on DOUBLE_MAX
|
||||
for (int i = 0; i < total_polynomials; i++) {
|
||||
int complex_current_poly_idx = i * polynomial_size / 2;
|
||||
int torus_current_poly_idx = i * polynomial_size;
|
||||
for (int j = 0; j < polynomial_size / 2; j++) {
|
||||
h_bsk[complex_current_poly_idx + j].x =
|
||||
src[torus_current_poly_idx + 2 * j];
|
||||
h_bsk[complex_current_poly_idx + j].y =
|
||||
src[torus_current_poly_idx + 2 * j + 1];
|
||||
h_bsk[complex_current_poly_idx + j].x /=
|
||||
(double)std::numeric_limits<T>::max();
|
||||
h_bsk[complex_current_poly_idx + j].y /=
|
||||
(double)std::numeric_limits<T>::max();
|
||||
}
|
||||
}
|
||||
|
||||
cudaMemcpy(d_bsk, h_bsk, buffer_size, cudaMemcpyHostToDevice);
|
||||
|
||||
auto stream = static_cast<cudaStream_t *>(v_stream);
|
||||
switch (polynomial_size) {
|
||||
// FIXME (Agnes): check if polynomial sizes are ok
|
||||
case 512:
|
||||
batch_NSMFFT<FFTDegree<Degree<512>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
break;
|
||||
case 1024:
|
||||
batch_NSMFFT<FFTDegree<Degree<1024>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
break;
|
||||
case 2048:
|
||||
batch_NSMFFT<FFTDegree<Degree<2048>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
break;
|
||||
case 4096:
|
||||
batch_NSMFFT<FFTDegree<Degree<4096>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
break;
|
||||
case 8192:
|
||||
batch_NSMFFT<FFTDegree<Degree<8192>, ForwardFFT>>
|
||||
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
cudaFree(d_bsk);
|
||||
free(h_bsk);
|
||||
|
||||
}
|
||||
|
||||
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
|
||||
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
|
||||
uint32_t l_gadget, uint32_t polynomial_size) {
|
||||
cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>((double2 *)dest, (int32_t *)src,
|
||||
v_stream, gpu_index, input_lwe_dim,
|
||||
glwe_dim, l_gadget, polynomial_size);
|
||||
}
|
||||
|
||||
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
|
||||
uint32_t gpu_index, uint32_t input_lwe_dim, uint32_t glwe_dim,
|
||||
uint32_t l_gadget, uint32_t polynomial_size) {
|
||||
cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>((double2 *)dest, (int64_t *)src,
|
||||
v_stream, gpu_index, input_lwe_dim,
|
||||
glwe_dim, l_gadget, polynomial_size);
|
||||
}
|
||||
|
||||
#endif // CNCRT_BSK_H
|
||||
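To make the compression step above concrete: each Torus polynomial of size N is packed into N/2 complex coefficients (even coefficients in the real part, odd ones in the imaginary part) and normalized by the maximum of the integer representation. A minimal host-side sketch of that transformation for 64-bit keys follows; compress_polynomial is a hypothetical helper written for illustration, not part of this diff:

#include <complex>
#include <cstdint>
#include <limits>
#include <vector>

// Packs one real Torus polynomial of size N into N/2 complex values,
// normalized to roughly [-0.5, 0.5], mirroring the conversion loop of
// cuda_convert_lwe_bootstrap_key above.
std::vector<std::complex<double>>
compress_polynomial(const std::vector<int64_t> &poly) {
  std::vector<std::complex<double>> out(poly.size() / 2);
  const double max = (double)std::numeric_limits<uint64_t>::max();
  for (size_t j = 0; j < out.size(); j++) {
    // even coefficient -> real part, odd coefficient -> imaginary part
    out[j] = {poly[2 * j] / max, poly[2 * j + 1] / max};
  }
  return out;
}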
91
src/crypto/gadget.cuh
Normal file
@@ -0,0 +1,91 @@
#ifndef CNCRT_CRYPTO_H
#define CNCRT_CRYPTO_H

#include "polynomial/polynomial.cuh"
#include <cstdint>

template <typename T, class params> class GadgetMatrix {
private:
  uint32_t l_gadget;
  uint32_t base_log;
  uint32_t mask;
  uint32_t halfbg;
  T offset;

public:
  __device__ GadgetMatrix(uint32_t base_log, uint32_t l_gadget)
      : base_log(base_log), l_gadget(l_gadget) {
    uint32_t bg = 1 << base_log;
    this->halfbg = bg / 2;
    this->mask = bg - 1;
    T temp = 0;
    for (int i = 0; i < this->l_gadget; i++) {
      temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
    }
    this->offset = temp * this->halfbg;
  }

  template <typename V, typename U>
  __device__ void decompose_one_level(Polynomial<V, params> &result,
                                      Polynomial<U, params> &polynomial,
                                      uint32_t level) {
    int tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      T s = polynomial.coefficients[tid] + this->offset;
      uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
      T temp1 = (s >> decal) & this->mask;
      result.coefficients[tid] = (V)(temp1 - this->halfbg);
      tid += params::degree / params::opt;
    }
  }

  template <typename V, typename U>
  __device__ void decompose_one_level(V *result, U *polynomial,
                                      uint32_t level) {
    int tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      T s = polynomial[tid] + this->offset;
      uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
      T temp1 = (s >> decal) & this->mask;
      result[tid] = (V)(temp1 - this->halfbg);
      tid += params::degree / params::opt;
    }
  }

  __device__ T decompose_one_level_single(T element, uint32_t level) {
    T s = element + this->offset;
    uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
    T temp1 = (s >> decal) & this->mask;
    return (T)(temp1 - this->halfbg);
  }
};

template <typename T> class GadgetMatrixSingle {
private:
  uint32_t l_gadget;
  uint32_t base_log;
  uint32_t mask;
  uint32_t halfbg;
  T offset;

public:
  __device__ GadgetMatrixSingle(uint32_t base_log, uint32_t l_gadget)
      : base_log(base_log), l_gadget(l_gadget) {
    uint32_t bg = 1 << base_log;
    this->halfbg = bg / 2;
    this->mask = bg - 1;
    T temp = 0;
    for (int i = 0; i < this->l_gadget; i++) {
      temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
    }
    this->offset = temp * this->halfbg;
  }

  __device__ T decompose_one_level_single(T element, uint32_t level) {
    T s = element + this->offset;
    uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
    T temp1 = (s >> decal) & this->mask;
    return (T)(temp1 - this->halfbg);
  }
};

#endif // CNCRT_CRYPTO_H
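To make the decomposition concrete, here is a host-side sketch that mirrors decompose_one_level_single for T = uint64_t, base_log = 8 and l_gadget = 4, and checks that recomposing the signed digits against the gadget basis gives back the input when the input is already a multiple of 2^(64 - l_gadget*base_log) (which round_to_closest_multiple in crypto/torus.cuh guarantees). Everything below is illustrative, not part of the diff:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t base_log = 8, l_gadget = 4;
  const uint64_t bg = 1ULL << base_log, halfbg = bg / 2, mask = bg - 1;
  uint64_t offset = 0;
  for (uint32_t i = 0; i < l_gadget; i++)
    offset += 1ULL << (64 - (i + 1) * base_log);
  offset *= halfbg;

  const uint64_t x = 0x1234567900000000ULL; // already a multiple of 2^32
  uint64_t recomposed = 0;
  for (uint32_t level = 0; level < l_gadget; level++) {
    // same computation as decompose_one_level_single
    uint64_t s = x + offset;
    uint32_t decal = 64 - (level + 1) * base_log;
    int64_t digit = (int64_t)((s >> decal) & mask) - (int64_t)halfbg;
    // digits live in [-halfbg, halfbg); recompose digit * 2^decal
    recomposed += (uint64_t)(digit << decal);
  }
  assert(recomposed == x);
  return 0;
}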
45
src/crypto/torus.cuh
Normal file
@@ -0,0 +1,45 @@
#ifndef CNCRT_TORUS_H
#define CNCRT_TORUS_H

#include "types/int128.cuh"
#include <limits>

template <typename Torus>
__device__ inline Torus typecast_double_to_torus(double x) {
  if constexpr (sizeof(Torus) < 8) {
    // this simple cast works up to 32 bits, afterwards we must do it manually
    long long ret = x;
    return (Torus)ret;
  } else {
    int128 nnnn = make_int128_from_float(x);
    uint64_t lll = nnnn.lo_;
    return lll;
  }
  // nvcc doesn't get that the if {} else {} above always returns
  // something, and complains that this function might return nothing, so we
  // put this useless return here
  return 0;
}

template <typename T>
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
                                              uint32_t l_gadget) {
  T shift = sizeof(T) * 8 - l_gadget * base_log;
  T mask = 1ll << (shift - 1);
  T b = (x & mask) >> (shift - 1);
  T res = x >> shift;
  res += b;
  res <<= shift;
  return res;
}

template <typename T>
__device__ __forceinline__ T rescale_torus_element(T element,
                                                   uint32_t log_shift) {
  // todo(Joao): not sure if this works
  // return element >> log_shift;
  return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
               (double)log_shift);
}

#endif // CNCRT_TORUS_H
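A quick worked example of round_to_closest_multiple with T = uint64_t, base_log = 8 and l_gadget = 4, which is the rounding applied before gadget decomposition:

// shift = 64 - 4 * 8 = 32, so x is rounded to the closest multiple of 2^32.
// With x = 0x123456789ABCDEF0:
//   b   = bit 31 of x = 1           (the discarded low part is >= 2^31)
//   res = (x >> 32) + 1 = 0x12345679
//   res <<= 32  ->  0x1234567900000000
// round_to_closest_multiple(0x123456789ABCDEF0, 8, 4) == 0x1234567900000000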
147
src/device.cu
Normal file
@@ -0,0 +1,147 @@
#include "device.h"
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>
#include <helper_cuda.h>

/// Unsafe function to create a CUDA stream, must check first that the GPU exists
void *cuda_create_stream(uint32_t gpu_index) {
  cudaSetDevice(gpu_index);
  cudaStream_t *stream = new cudaStream_t;
  cudaStreamCreate(stream);
  return stream;
}

/// Unsafe function to destroy a CUDA stream, must check first that the GPU exists
int cuda_destroy_stream(void *v_stream, uint32_t gpu_index) {
  cudaSetDevice(gpu_index);
  auto stream = static_cast<cudaStream_t *>(v_stream);
  cudaStreamDestroy(*stream);
  return 0;
}

/// Unsafe function that will try to allocate even if gpu_index is invalid
/// or if there's not enough memory. A safe wrapper around it must call
/// cuda_check_valid_malloc() first
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
  cudaSetDevice(gpu_index);
  void *ptr;
  checkCudaErrors(cudaMalloc((void **)&ptr, size));

  return ptr;
}

/// Checks that the allocation is valid
///  0: valid
/// -1: invalid, not enough memory on the device
/// -2: invalid, gpu index doesn't exist
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {

  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaSetDevice(gpu_index);
  size_t total_mem, free_mem;
  cudaMemGetInfo(&free_mem, &total_mem);
  if (size > free_mem) {
    // error code: not enough memory
    return -1;
  }
  return 0;
}

/// Tries to copy memory to the GPU asynchronously
///  0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                             void *v_stream, uint32_t gpu_index) {
  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaPointerAttributes attr;
  cudaPointerGetAttributes(&attr, dest);
  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
    // error code: invalid device pointer
    return -1;
  }
  auto stream = static_cast<cudaStream_t *>(v_stream);
  cudaSetDevice(gpu_index);
  checkCudaErrors(cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice,
                                  *stream));
  return 0;
}

/// Synchronizes the device
///  0: success
/// -2: error, gpu index doesn't exist
int cuda_synchronize_device(uint32_t gpu_index) {
  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaSetDevice(gpu_index);
  cudaDeviceSynchronize();
  return 0;
}

/// Tries to copy memory back to the CPU asynchronously
///  0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                             void *v_stream, uint32_t gpu_index) {
  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaPointerAttributes attr;
  cudaPointerGetAttributes(&attr, src);
  if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
    // error code: invalid device pointer
    return -1;
  }
  auto stream = static_cast<cudaStream_t *>(v_stream);
  cudaSetDevice(gpu_index);
  checkCudaErrors(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost,
                                  *stream));
  return 0;
}

/// Returns the number of GPUs available
int cuda_get_number_of_gpus() {
  int num_gpus;
  cudaGetDeviceCount(&num_gpus);
  return num_gpus;
}

/// Drops a cuda array
int cuda_drop(void *ptr, uint32_t gpu_index) {
  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaSetDevice(gpu_index);
  checkCudaErrors(cudaFree(ptr));
  return 0;
}

/// Gets the maximum size for the shared memory
int cuda_get_max_shared_memory(uint32_t gpu_index) {
  if (gpu_index >= cuda_get_number_of_gpus()) {
    // error code: invalid gpu_index
    return -2;
  }
  cudaSetDevice(gpu_index);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, gpu_index);
  int max_shared_memory = 0;
  if (prop.major > 7) {
    max_shared_memory = prop.sharedMemPerMultiprocessor;
  } else {
    max_shared_memory = prop.sharedMemPerBlock;
  }
  return max_shared_memory;
}
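A hedged usage sketch of the helpers above, showing the intended call order (check the allocation first, then allocate, copy, synchronize and free); demo is a hypothetical function written for illustration:

#include <cstdint>
#include <vector>

int demo(uint32_t gpu_index) {
  const uint64_t size = 1024 * sizeof(uint64_t);
  if (cuda_check_valid_malloc(size, gpu_index) != 0)
    return -1;
  void *stream = cuda_create_stream(gpu_index);
  void *d_buf = cuda_malloc(size, gpu_index);
  std::vector<uint64_t> h_buf(1024, 42);
  cuda_memcpy_async_to_gpu(d_buf, h_buf.data(), size, stream, gpu_index);
  // ... launch kernels on `stream` here ...
  cuda_memcpy_async_to_cpu(h_buf.data(), d_buf, size, stream, gpu_index);
  cuda_synchronize_device(gpu_index);
  cuda_drop(d_buf, gpu_index);
  cuda_destroy_stream(stream, gpu_index);
  return 0;
}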
753
src/fft/bnsmfft.cuh
Normal file
@@ -0,0 +1,753 @@
#ifndef GPU_BOOTSTRAP_FFT_1024_CUH
#define GPU_BOOTSTRAP_FFT_1024_CUH

#include "complex/operations.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "twiddles.cuh"

/*
 * bit reverse
 * coefficient bits are reversed based on the precalculated
 * SW1 and SW2 swap indexes
 */
template <class params> __device__ void bit_reverse_inplace(double2 *A) {
  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    short sw1 = SW1[tid];
    short sw2 = SW2[tid];
    double2 tmp = A[sw1];
    A[sw1] = A[sw2];
    A[sw2] = tmp;
    tid += params::degree / params::opt;
  }
}

/*
 * negacyclic twiddle
 * returns the negacyclic twiddle based on degree and index
 * twiddles are precalculated inside the negTwids{3..13} arrays
 */
template <int degree> __device__ double2 negacyclic_twiddle(int j) {
  double2 twid;
  switch (degree) {
  case 512:
    twid = negTwids9[j];
    break;
  case 1024:
    twid = negTwids10[j];
    break;
  case 2048:
    twid = negTwids11[j];
    break;
  case 4096:
    twid = negTwids12[j];
    break;
  case 8192:
    twid = negTwids13[j];
    break;
  default:
    twid.x = 0;
    twid.y = 0;
    break;
  }
  return twid;
}

/*
 * Direct negacyclic FFT:
 * - before the FFT the N real coefficients are stored into a
 *   N/2 sized complex with the even coefficients in the real part
 *   and the odd coefficients in the imaginary part. This is referred to
 *   as the half-size FFT
 * - when calling NSMFFT_direct for the forward negacyclic FFT of PBS,
 *   opt is divided by 2 because the butterfly pattern is always applied
 *   between pairs of coefficients
 * - instead of twisting each coefficient A_j before the FFT by
 *   multiplying by the w^j roots of unity (aka twiddles, w=exp(-i pi /N)),
 *   the FFT is modified, and for each level k of the FFT the twiddle:
 *   w_j,k = exp(-i pi j/2^k)
 *   is replaced with:
 *   \zeta_j,k = exp(-i pi (2j-1)/2^k)
 * - this technique also implies a correction of the
 *   complex obtained after the FFT, which is done in the
 *   forward_negacyclic_fft_inplace function of bootstrap.cuh
 */
template <class params> __device__ void NSMFFT_direct(double2 *A) {
  /* First, reverse the bits of the input complex
   * The bit reversal for the half-size FFT has been stored into the
   * SW1 and SW2 arrays beforehand
   * Each thread is always in charge of "opt/2" pairs of coefficients,
   * which is why we always loop through N/2 by N/opt strides
   * The pragma unroll instruction tells the compiler to unroll the
   * full loop, which should increase performance TODO (Agnes) check this
   */
  bit_reverse_inplace<params>(A);
  __syncthreads();

  // Now we go through all the levels of the FFT one by one
  // (instead of recursively)
  // first iteration: k=1, zeta=i for all coefficients
  int tid = threadIdx.x;
  int i1, i2;
  double2 u, v;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 2
    i1 = tid << 1;
    i2 = i1 + 1;
    u = A[i1];
    // v = i*A[i2]
    v.y = A[i2].x;
    v.x = -A[i2].y;
    // A[i1] <- A[i1] + i*A[i2]
    // A[i2] <- A[i1] - i*A[i2]
    A[i1] += v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // second iteration: apply the butterfly pattern
  // between groups of 4 coefficients
  // k=2, \zeta=exp(i pi/4) for even coefficients and
  // exp(3 i pi / 4) for odd coefficients
  // TODO (Agnes) how does this work on the gpu? aren't we doing
  // a lot more computations than we should?
  tid = threadIdx.x;
  // odd = 0 for even coefficients, 1 for odd coefficients
  int odd = tid & 1;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 2
    // i1=2*tid if tid is even and 2*tid-1 if it is odd
    i1 = (tid << 1) - odd;
    i2 = i1 + 2;

    double a = A[i2].x;
    double b = A[i2].y;
    u = A[i1];

    // \zeta_j,2 = exp(-i pi (2j-1)/4) -> j=0: exp(i pi/4) or j=1: exp(-i pi/4)
    // \zeta_even = sqrt(2)/2 + i * sqrt(2)/2 = sqrt(2)/2*(1+i)
    // \zeta_odd = sqrt(2)/2 - i * sqrt(2)/2 = sqrt(2)/2*(1-i)

    // v_j = \zeta_j * (a+i*b)
    // v_even = sqrt(2)/2*((a-b)+i*(a+b))
    // v_odd = sqrt(2)/2*(a+b+i*(b-a))
    v.x =
        (odd) ? (-0.707106781186548) * (a + b) : (0.707106781186548) * (a - b);
    v.y = (odd) ? (0.707106781186548) * (a - b) : (0.707106781186548) * (a + b);

    // v.x = (0.707106781186548 * odd) * (a + b) + (0.707106781186548 * (!odd))
    // * (a - b); v.y = (0.707106781186548 * odd) * (b - a) + (0.707106781186548
    // * (!odd)) * (a + b);

    A[i1] = u + v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // third iteration
  // from k=3 on, we have to do the full complex multiplication
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 4
    // rem is the remainder of tid/4. tid takes values:
    // 0, 1, 2, 3, 4, 5, 6, 7, ... N/4
    // then rem takes values:
    // 0, 1, 2, 3, 0, 1, 2, 3, ... N/4
    // and striding by 4 will allow us to cover all
    // the coefficients correctly
    int rem = tid & 3;
    i1 = (tid << 1) - rem;
    i2 = i1 + 4;

    double2 w = negTwids3[rem];
    u = A[i1], v = A[i2] * w;

    A[i1] = u + v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 4th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 8
    // rem is the remainder of tid/8. tid takes values:
    // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ... N/4
    // then rem takes values:
    // 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, ... N/4
    // and striding by 8 will allow us to cover all
    // the coefficients correctly
    int rem = tid & 7;
    i1 = (tid << 1) - rem;
    i2 = i1 + 8;

    double2 w = negTwids4[rem];
    u = A[i1], v = A[i2] * w;
    A[i1] = u + v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 5th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 16
    // rem is the remainder of tid/16
    // and the same logic as for previous iterations applies
    int rem = tid & 15;
    i1 = (tid << 1) - rem;
    i2 = i1 + 16;
    double2 w = negTwids5[rem];
    u = A[i1], v = A[i2] * w;
    A[i1] = u + v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 6th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 32
    // rem is the remainder of tid/32
    // and the same logic as for previous iterations applies
    int rem = tid & 31;
    i1 = (tid << 1) - rem;
    i2 = i1 + 32;
    double2 w = negTwids6[rem];
    u = A[i1], v = A[i2] * w;
    A[i1] = u + v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 7th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 64
    // rem is the remainder of tid/64
    // and the same logic as for previous iterations applies
    int rem = tid & 63;
    i1 = (tid << 1) - rem;
    i2 = i1 + 64;
    double2 w = negTwids7[rem];
    u = A[i1], v = A[i2] * w;
    A[i1] = u + v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 8th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 128
    // rem is the remainder of tid/128
    // and the same logic as for previous iterations applies
    int rem = tid & 127;
    i1 = (tid << 1) - rem;
    i2 = i1 + 128;
    double2 w = negTwids8[rem];
    u = A[i1], v = A[i2] * w;
    A[i1] = u + v;
    A[i2] = u - v;
    tid += params::degree / params::opt;
  }
  __syncthreads();
  if constexpr (params::log2_degree > 8) {
    // 9th iteration
    tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 256
      // rem is the remainder of tid/256
      // and the same logic as for previous iterations applies
      int rem = tid & 255;
      i1 = (tid << 1) - rem;
      i2 = i1 + 256;
      double2 w = negTwids9[rem];
      u = A[i1], v = A[i2] * w;
      A[i1] = u + v;
      A[i2] = u - v;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }
  if constexpr (params::log2_degree > 9) {
    // 10th iteration
    tid = threadIdx.x;
    //#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 512
      // rem is the remainder of tid/512
      // and the same logic as for previous iterations applies
      int rem = tid & 511;
      i1 = (tid << 1) - rem;
      i2 = i1 + 512;
      double2 w = negTwids10[rem];
      u = A[i1], v = A[i2] * w;
      A[i1] = u + v;
      A[i2] = u - v;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }

  if constexpr (params::log2_degree > 10) {
    // 11th iteration
    tid = threadIdx.x;
    //#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 1024
      // rem is the remainder of tid/1024
      // and the same logic as for previous iterations applies
      int rem = tid & 1023;
      i1 = (tid << 1) - rem;
      i2 = i1 + 1024;
      double2 w = negTwids11[rem];
      u = A[i1], v = A[i2] * w;
      A[i1] = u + v;
      A[i2] = u - v;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }

  if constexpr (params::log2_degree > 11) {
    // 12th iteration
    tid = threadIdx.x;
    //#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 2048
      // rem is the remainder of tid/2048
      // and the same logic as for previous iterations applies
      int rem = tid & 2047;
      i1 = (tid << 1) - rem;
      i2 = i1 + 2048;
      double2 w = negTwids12[rem];
      u = A[i1], v = A[i2] * w;
      A[i1] = u + v;
      A[i2] = u - v;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }
  // Real polynomials handled should not exceed a degree of 8192
}

/*
 * negacyclic inverse fft
 */
template <class params> __device__ void NSMFFT_inverse(double2 *A) {
  /* The inverse FFT runs the same butterfly levels as the direct FFT in
   * reverse order, using conjugated twiddles and a 1/2 scaling per level,
   * and applies the bit reversal (stored in the SW1 and SW2 arrays) at the
   * end. Each thread is always in charge of "opt/2" pairs of coefficients,
   * which is why we always loop through N/2 by N/opt strides
   */
  int tid;
  int i1, i2;
  double2 u, v;
  if constexpr (params::log2_degree > 11) {
    // 12th iteration
    tid = threadIdx.x;
    //#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 2048
      // rem is the remainder of tid/2048
      // and the same logic as for previous iterations applies
      int rem = tid & 2047;
      i1 = (tid << 1) - rem;
      i2 = i1 + 2048;
      double2 w = conjugate(negTwids12[rem]);
      u = A[i1], v = A[i2];
      A[i1] = (u + v) * 0.5;
      A[i2] = (u - v) * w * 0.5;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }

  if constexpr (params::log2_degree > 10) {
    // 11th iteration
    tid = threadIdx.x;
    //#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 1024
      // rem is the remainder of tid/1024
      // and the same logic as for previous iterations applies
      int rem = tid & 1023;
      i1 = (tid << 1) - rem;
      i2 = i1 + 1024;
      double2 w = conjugate(negTwids11[rem]);
      u = A[i1], v = A[i2];
      A[i1] = (u + v) * 0.5;
      A[i2] = (u - v) * w * 0.5;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }

  if constexpr (params::log2_degree > 9) {
    // 10th iteration
    tid = threadIdx.x;
    //#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 512
      // rem is the remainder of tid/512
      // and the same logic as for previous iterations applies
      int rem = tid & 511;
      i1 = (tid << 1) - rem;
      i2 = i1 + 512;
      double2 w = conjugate(negTwids10[rem]);
      u = A[i1], v = A[i2];
      A[i1] = (u + v) * 0.5;
      A[i2] = (u - v) * w * 0.5;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }

  if constexpr (params::log2_degree > 8) {
    // 9th iteration
    tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      // the butterfly pattern is applied to each pair
      // of coefficients, with a stride of 256
      // rem is the remainder of tid/256
      // and the same logic as for previous iterations applies
      int rem = tid & 255;
      i1 = (tid << 1) - rem;
      i2 = i1 + 256;
      double2 w = conjugate(negTwids9[rem]);
      u = A[i1], v = A[i2];
      A[i1] = (u + v) * 0.5;
      A[i2] = (u - v) * w * 0.5;
      tid += params::degree / params::opt;
    }
    __syncthreads();
  }

  // 8th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 128
    // rem is the remainder of tid/128
    // and the same logic as for previous iterations applies
    int rem = tid & 127;
    i1 = (tid << 1) - rem;
    i2 = i1 + 128;
    double2 w = conjugate(negTwids8[rem]);
    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 7th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 64
    // rem is the remainder of tid/64
    // and the same logic as for previous iterations applies
    int rem = tid & 63;
    i1 = (tid << 1) - rem;
    i2 = i1 + 64;
    double2 w = conjugate(negTwids7[rem]);
    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 6th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 32
    // rem is the remainder of tid/32
    // and the same logic as for previous iterations applies
    int rem = tid & 31;
    i1 = (tid << 1) - rem;
    i2 = i1 + 32;
    double2 w = conjugate(negTwids6[rem]);
    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 5th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 16
    // rem is the remainder of tid/16
    // and the same logic as for previous iterations applies
    int rem = tid & 15;
    i1 = (tid << 1) - rem;
    i2 = i1 + 16;
    double2 w = conjugate(negTwids5[rem]);
    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // 4th iteration
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 8
    // rem is the remainder of tid/8. tid takes values:
    // 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ... N/4
    // then rem takes values:
    // 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, ... N/4
    // and striding by 8 will allow us to cover all
    // the coefficients correctly
    int rem = tid & 7;
    i1 = (tid << 1) - rem;
    i2 = i1 + 8;

    double2 w = conjugate(negTwids4[rem]);
    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // third iteration
  // from k=3 on, we have to do the full complex multiplication
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 4
    // rem is the remainder of tid/4. tid takes values:
    // 0, 1, 2, 3, 4, 5, 6, 7, ... N/4
    // then rem takes values:
    // 0, 1, 2, 3, 0, 1, 2, 3, ... N/4
    // and striding by 4 will allow us to cover all
    // the coefficients correctly
    int rem = tid & 3;
    i1 = (tid << 1) - rem;
    i2 = i1 + 4;

    double2 w = conjugate(negTwids3[rem]);
    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // second iteration: apply the butterfly pattern
  // between groups of 4 coefficients
  // k=2, \zeta=exp(i pi/4) for even coefficients and
  // exp(3 i pi / 4) for odd coefficients
  // TODO (Agnes) how does this work on the gpu? aren't we doing
  // a lot more computations than we should?
  tid = threadIdx.x;
  // odd = 0 for even coefficients, 1 for odd coefficients
  int odd = tid & 1;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 2
    // i1=2*tid if tid is even and 2*tid-1 if it is odd
    i1 = (tid << 1) - odd;
    i2 = i1 + 2;

    // TODO(Beka) optimize twiddle multiplication
    double2 w;
    if (odd) {
      w.x = -0.707106781186547461715008466854;
      w.y = -0.707106781186547572737310929369;
    } else {
      w.x = 0.707106781186547461715008466854;
      w.y = -0.707106781186547572737310929369;
    }

    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  // first iteration: k=1, zeta=i for all coefficients
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    // the butterfly pattern is applied to each pair
    // of coefficients, with a stride of 2
    i1 = tid << 1;
    i2 = i1 + 1;
    // TODO(Beka) optimize twiddle multiplication
    double2 w = {0, -1};
    u = A[i1], v = A[i2];
    A[i1] = (u + v) * 0.5;
    A[i2] = (u - v) * w * 0.5;
    tid += params::degree / params::opt;
  }
  __syncthreads();

  bit_reverse_inplace<params>(A);
  __syncthreads();
  // Real polynomials handled should not exceed a degree of 8192
}

/*
 * correction after the direct fft
 * does not use extra shared memory for the recovery:
 * the correction is done using registers.
 * based on Pascal's paper
 */
template <class params>
__device__ void correction_direct_fft_inplace(double2 *x) {
  constexpr int threads = params::degree / params::opt;
  int tid = threadIdx.x;
  double2 left[params::opt / 4];
  double2 right[params::opt / 4];
#pragma unroll
  for (int i = 0; i < params::opt / 4; i++) {
    left[i] = x[tid + i * threads];
  }
#pragma unroll
  for (int i = 0; i < params::opt / 4; i++) {
    right[i] = x[params::degree / 2 - (tid + i * threads + 1)];
  }
#pragma unroll
  for (int i = 0; i < params::opt / 4; i++) {
    double2 tw = negacyclic_twiddle<params::degree>(tid + i * threads);
    double add_RE = left[i].x + right[i].x;
    double sub_RE = left[i].x - right[i].x;
    double add_IM = left[i].y + right[i].y;
    double sub_IM = left[i].y - right[i].y;

    double tmp1 = add_IM * tw.x + sub_RE * tw.y;
    double tmp2 = -sub_RE * tw.x + add_IM * tw.y;
    x[tid + i * threads].x = (add_RE + tmp1) * 0.5;
    x[tid + i * threads].y = (sub_IM + tmp2) * 0.5;
    x[params::degree / 2 - (tid + i * threads + 1)].x = (add_RE - tmp1) * 0.5;
    x[params::degree / 2 - (tid + i * threads + 1)].y = (-sub_IM + tmp2) * 0.5;
  }
}

/*
 * correction before the inverse fft
 * does not use extra shared memory for the recovery:
 * the correction is done using registers.
 * based on Pascal's paper
 */
template <class params>
__device__ void correction_inverse_fft_inplace(double2 *x) {
  constexpr int threads = params::degree / params::opt;
  int tid = threadIdx.x;
  double2 left[params::opt / 4];
  double2 right[params::opt / 4];
#pragma unroll
  for (int i = 0; i < params::opt / 4; i++) {
    left[i] = x[tid + i * threads];
  }
#pragma unroll
  for (int i = 0; i < params::opt / 4; i++) {
    right[i] = x[params::degree / 2 - (tid + i * threads + 1)];
  }
#pragma unroll
  for (int i = 0; i < params::opt / 4; i++) {
    double2 tw = negacyclic_twiddle<params::degree>(tid + i * threads);
    double add_RE = left[i].x + right[i].x;
    double sub_RE = left[i].x - right[i].x;
    double add_IM = left[i].y + right[i].y;
    double sub_IM = left[i].y - right[i].y;

    double tmp1 = add_IM * tw.x - sub_RE * tw.y;
    double tmp2 = sub_RE * tw.x + add_IM * tw.y;
    x[tid + i * threads].x = (add_RE - tmp1) * 0.5;
    x[tid + i * threads].y = (sub_IM + tmp2) * 0.5;
    x[params::degree / 2 - (tid + i * threads + 1)].x = (add_RE + tmp1) * 0.5;
    x[params::degree / 2 - (tid + i * threads + 1)].y = (-sub_IM + tmp2) * 0.5;
  }
}

/*
 * global batch fft
 * performs the fft in half size
 * unrolling the half-size fft results in half size + 1 elements
 * this function must be called with the actual degree
 * and takes an already compressed input
 */
template <class params>
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output) {
  extern __shared__ double2 sharedMemoryFFT[];
  double2 *fft = sharedMemoryFFT;

  int tid = threadIdx.x;

#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    fft[tid] = d_input[blockIdx.x * (params::degree / 2) + tid];
    tid = tid + params::degree / params::opt;
  }
  __syncthreads();
  NSMFFT_direct<HalfDegree<params>>(fft);
  __syncthreads();
  correction_direct_fft_inplace<params>(fft);
  __syncthreads();

  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
    tid = tid + params::degree / params::opt;
  }
}

#endif // GPU_BOOTSTRAP_FFT_1024_CUH
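When testing the negacyclic FFT path above, a naive O(N^2) product over Z[X]/(X^N + 1) makes a convenient CPU oracle: a coefficient that wraps past degree N re-enters with a negated sign, since X^N = -1. This is a test-side sketch, not part of the GPU code:

#include <cstdint>
#include <vector>

std::vector<int64_t> negacyclic_mul(const std::vector<int64_t> &a,
                                    const std::vector<int64_t> &b) {
  const size_t N = a.size();
  std::vector<int64_t> c(N, 0);
  for (size_t i = 0; i < N; i++) {
    for (size_t j = 0; j < N; j++) {
      if (i + j < N)
        c[i + j] += a[i] * b[j]; // regular term
      else
        c[i + j - N] -= a[i] * b[j]; // wrapped term picks up a minus sign
    }
  }
  return c;
}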
402
src/fft/smfft.cuh
Normal file
@@ -0,0 +1,402 @@
/*
#ifndef GPU_BOOTSTRAP_SMFFT_CUH
#define GPU_BOOTSTRAP_SMFFT_CUH

#include "../complex/operations.cuh"
#include "twiddles.cuh"

__device__ __inline__ double2 Get_W_value_inverse(int index) {
  double2 ctemp = _gTwiddles[index];
  ctemp.y = -ctemp.y;
  return (ctemp);
}

template <class params>
__device__ double2 Get_after_inverse_fft_twiddle(int index) {
  double2 ctemp;
  switch (params::degree) {
  case 512:
    ctemp = INVERSE_TWIDDLES_512[index];
    break;
  case 1024:
    ctemp = gTwiddles1024[index];
    ctemp.x /= params::degree;
    ctemp.y /= -params::degree;
    break;
  default:
    break;
  }

  return ctemp;
}

__device__ __inline__ double shfl(double *value, int par) {
#if (CUDART_VERSION >= 9000)
  return (__shfl_sync(0xffffffff, (*value), par));
#else
  return (__shfl((*value), par));
#endif
}

__device__ __inline__ double shfl_xor(double *value, int par) {
#if (CUDART_VERSION >= 9000)
  return (__shfl_xor_sync(0xffffffff, (*value), par));
#else
  return (__shfl_xor((*value), par));
#endif
}

__device__ __inline__ double shfl_down(double *value, int par) {
#if (CUDART_VERSION >= 9000)
  return (__shfl_down_sync(0xffffffff, (*value), par));
#else
  return (__shfl_down((*value), par));
#endif
}

__device__ __inline__ void
reorder_16_register(double2 *A_DFT_value, double2 *B_DFT_value,
                    double2 *C_DFT_value, double2 *D_DFT_value, int *local_id) {
  double2 Af2temp, Bf2temp, Cf2temp, Df2temp;
  unsigned int target = (((unsigned int)__brev(((*local_id) & 15))) >> (28)) +
                        16 * ((*local_id) >> 4);
  Af2temp.x = shfl(&(A_DFT_value->x), target);
  Af2temp.y = shfl(&(A_DFT_value->y), target);
  Bf2temp.x = shfl(&(B_DFT_value->x), target);
  Bf2temp.y = shfl(&(B_DFT_value->y), target);
  Cf2temp.x = shfl(&(C_DFT_value->x), target);
  Cf2temp.y = shfl(&(C_DFT_value->y), target);
  Df2temp.x = shfl(&(D_DFT_value->x), target);
  Df2temp.y = shfl(&(D_DFT_value->y), target);
  __syncwarp();
  (*A_DFT_value) = Af2temp;
  (*B_DFT_value) = Bf2temp;
  (*C_DFT_value) = Cf2temp;
  (*D_DFT_value) = Df2temp;
}

__device__ __inline__ void reorder_32_register(double2 *A_DFT_value,
                                               double2 *B_DFT_value,
                                               double2 *C_DFT_value,
                                               double2 *D_DFT_value) {
  double2 Af2temp, Bf2temp, Cf2temp, Df2temp;
  unsigned int target = ((unsigned int)__brev(threadIdx.x)) >> (27);
  Af2temp.x = shfl(&(A_DFT_value->x), target);
  Af2temp.y = shfl(&(A_DFT_value->y), target);
  Bf2temp.x = shfl(&(B_DFT_value->x), target);
  Bf2temp.y = shfl(&(B_DFT_value->y), target);
  Cf2temp.x = shfl(&(C_DFT_value->x), target);
  Cf2temp.y = shfl(&(C_DFT_value->y), target);
  Df2temp.x = shfl(&(D_DFT_value->x), target);
  Df2temp.y = shfl(&(D_DFT_value->y), target);
  __syncwarp();
  (*A_DFT_value) = Af2temp;
  (*B_DFT_value) = Bf2temp;
  (*C_DFT_value) = Cf2temp;
  (*D_DFT_value) = Df2temp;
}

template <class params>
__device__ __inline__ void
reorder_512(double2 *s_input, double2 *A_DFT_value, double2 *B_DFT_value,
            double2 *C_DFT_value, double2 *D_DFT_value) {
  int local_id = threadIdx.x & (params::warp - 1);
  int warp_id = threadIdx.x / params::warp;

  // reorder elements within warp so we can save them in semi-transposed manner
  // into shared memory
  reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value);

  __syncthreads();
  unsigned int sm_store_pos =
      (local_id >> 1) + 16 * (local_id & 1) + warp_id * 132;
  s_input[sm_store_pos] = *A_DFT_value;
  s_input[sm_store_pos + 33] = *B_DFT_value;
  s_input[66 + sm_store_pos] = *C_DFT_value;
  s_input[66 + sm_store_pos + 33] = *D_DFT_value;

  __syncthreads();

  // Read shared memory to get reordered input
  unsigned int sm_read_pos = (local_id & 15) * 32 + local_id + warp_id * 4;
  __syncthreads();
  *A_DFT_value = s_input[sm_read_pos + 0];
  *B_DFT_value = s_input[sm_read_pos + 1];
  *C_DFT_value = s_input[sm_read_pos + 2];
  *D_DFT_value = s_input[sm_read_pos + 3];

  __syncthreads();
  reorder_16_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value,
                      &local_id);

  __syncthreads();
}

template <class params>
__device__ __inline__ void
reorder_1024(double2 *s_input, double2 *A_DFT_value, double2 *B_DFT_value,
             double2 *C_DFT_value, double2 *D_DFT_value) {
  int local_id = threadIdx.x & (params::warp - 1);
  int warp_id = threadIdx.x / params::warp;

  // reorder elements within params::warp so we can save them in
  // semi-transposed manner into shared memory
  reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value);

  __syncthreads();
  unsigned int sm_store_pos =
      (local_id >> 0) + 32 * (local_id & 0) + warp_id * 132;
  s_input[sm_store_pos] = *A_DFT_value;
  s_input[sm_store_pos + 33] = *B_DFT_value;
  s_input[66 + sm_store_pos] = *C_DFT_value;
  s_input[66 + sm_store_pos + 33] = *D_DFT_value;

  // Read shared memory to get reordered input
  unsigned int sm_read_pos = (local_id & 31) * 32 + local_id + warp_id * 4;
  __syncthreads();
  *A_DFT_value = s_input[sm_read_pos + 0];
  *B_DFT_value = s_input[sm_read_pos + 1];
  *C_DFT_value = s_input[sm_read_pos + 2];
  *D_DFT_value = s_input[sm_read_pos + 3];

  __syncthreads();
  reorder_32_register(A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value);
}

__device__ bool printOnce = true;

template <class params> __device__ void do_SMFFT_CT_DIT(double2 *s_input) {
  double2 A_DFT_value, B_DFT_value, C_DFT_value, D_DFT_value;
  double2 W;
  double2 Aftemp, Bftemp, Cftemp, Dftemp;

  int j, m_param;
  int parity, itemp;
  int A_read_index, B_read_index, C_read_index, D_read_index;
  int PoT, PoTp1, q;

  int local_id = threadIdx.x & (params::warp - 1);
  int warp_id = threadIdx.x / params::warp;
  A_DFT_value = s_input[local_id + (warp_id << 2) * params::warp];
  B_DFT_value =
      s_input[local_id + (warp_id << 2) * params::warp + params::warp];
  C_DFT_value =
      s_input[local_id + (warp_id << 2) * params::warp + 2 * params::warp];
  D_DFT_value =
      s_input[local_id + (warp_id << 2) * params::warp + 3 * params::warp];

  switch (params::log2_degree) {
  case 9:
    reorder_512<params>(s_input, &A_DFT_value, &B_DFT_value, &C_DFT_value,
                        &D_DFT_value);
    break;
  case 10:
    reorder_1024<params>(s_input, &A_DFT_value, &B_DFT_value, &C_DFT_value,
                         &D_DFT_value);
    break;
  // case 11:
  //   reorder_2048<params, opt>(s_input, &A_DFT_value, &B_DFT_value,
  //                             &C_DFT_value, &D_DFT_value);
  //   break;
  default:
    break;
  }

  //----> FFT
  PoT = 1;
  PoTp1 = 2;

  //--> First iteration
  itemp = local_id & 1;
  parity = (1 - itemp * 2);

  A_DFT_value.x = parity * A_DFT_value.x + shfl_xor(&A_DFT_value.x, 1);
  A_DFT_value.y = parity * A_DFT_value.y + shfl_xor(&A_DFT_value.y, 1);
  B_DFT_value.x = parity * B_DFT_value.x + shfl_xor(&B_DFT_value.x, 1);
  B_DFT_value.y = parity * B_DFT_value.y + shfl_xor(&B_DFT_value.y, 1);
  C_DFT_value.x = parity * C_DFT_value.x + shfl_xor(&C_DFT_value.x, 1);
  C_DFT_value.y = parity * C_DFT_value.y + shfl_xor(&C_DFT_value.y, 1);
  D_DFT_value.x = parity * D_DFT_value.x + shfl_xor(&D_DFT_value.x, 1);
  D_DFT_value.y = parity * D_DFT_value.y + shfl_xor(&D_DFT_value.y, 1);

  //--> Second through Fifth iteration (no synchronization)
  PoT = 2;
  PoTp1 = 4;
  for (q = 1; q < 5; q++) {
    m_param = (local_id & (PoTp1 - 1));
    itemp = m_param >> q;
    parity = ((itemp << 1) - 1);
    if (params::fft_direction)
      W = Get_W_value_inverse((q - 1) * 257 + itemp * m_param);
    else
      W = _gTwiddles[(q - 1) * 257 + itemp * m_param];
    Aftemp.x = W.x * A_DFT_value.x - W.y * A_DFT_value.y;
    Aftemp.y = W.x * A_DFT_value.y + W.y * A_DFT_value.x;
    Bftemp.x = W.x * B_DFT_value.x - W.y * B_DFT_value.y;
    Bftemp.y = W.x * B_DFT_value.y + W.y * B_DFT_value.x;
    Cftemp.x = W.x * C_DFT_value.x - W.y * C_DFT_value.y;
    Cftemp.y = W.x * C_DFT_value.y + W.y * C_DFT_value.x;
    Dftemp.x = W.x * D_DFT_value.x - W.y * D_DFT_value.y;
    Dftemp.y = W.x * D_DFT_value.y + W.y * D_DFT_value.x;

    A_DFT_value.x = Aftemp.x + parity * shfl_xor(&Aftemp.x, PoT);
    A_DFT_value.y = Aftemp.y + parity * shfl_xor(&Aftemp.y, PoT);
    B_DFT_value.x = Bftemp.x + parity * shfl_xor(&Bftemp.x, PoT);
    B_DFT_value.y = Bftemp.y + parity * shfl_xor(&Bftemp.y, PoT);
    C_DFT_value.x = Cftemp.x + parity * shfl_xor(&Cftemp.x, PoT);
    C_DFT_value.y = Cftemp.y + parity * shfl_xor(&Cftemp.y, PoT);
    D_DFT_value.x = Dftemp.x + parity * shfl_xor(&Dftemp.x, PoT);
    D_DFT_value.y = Dftemp.y + parity * shfl_xor(&Dftemp.y, PoT);

    PoT = PoT << 1;
    PoTp1 = PoTp1 << 1;
  }

  itemp = local_id + (warp_id << 2) * params::warp;
  s_input[itemp] = A_DFT_value;
  s_input[itemp + params::warp] = B_DFT_value;
  s_input[itemp + 2 * params::warp] = C_DFT_value;
  s_input[itemp + 3 * params::warp] = D_DFT_value;

  for (q = 5; q < (params::log2_degree - 1); q++) {
    __syncthreads();
    m_param = threadIdx.x & (PoT - 1);
    j = threadIdx.x >> q;

    if (params::fft_direction)
      W = Get_W_value_inverse((q - 1) * 257 + m_param);
    else
      W = _gTwiddles[(q - 1) * 257 + m_param];

    A_read_index = j * (PoTp1 << 1) + m_param;
    B_read_index = j * (PoTp1 << 1) + m_param + PoT;
    C_read_index = j * (PoTp1 << 1) + m_param + PoTp1;
    D_read_index = j * (PoTp1 << 1) + m_param + 3 * PoT;

    Aftemp = s_input[A_read_index];
    Bftemp = s_input[B_read_index];
    A_DFT_value.x = Aftemp.x + W.x * Bftemp.x - W.y * Bftemp.y;
    A_DFT_value.y = Aftemp.y + W.x * Bftemp.y + W.y * Bftemp.x;
    B_DFT_value.x = Aftemp.x - W.x * Bftemp.x + W.y * Bftemp.y;
    B_DFT_value.y = Aftemp.y - W.x * Bftemp.y - W.y * Bftemp.x;

    Cftemp = s_input[C_read_index];
    Dftemp = s_input[D_read_index];
    C_DFT_value.x = Cftemp.x + W.x * Dftemp.x - W.y * Dftemp.y;
    C_DFT_value.y = Cftemp.y + W.x * Dftemp.y + W.y * Dftemp.x;
    D_DFT_value.x = Cftemp.x - W.x * Dftemp.x + W.y * Dftemp.y;
    D_DFT_value.y = Cftemp.y - W.x * Dftemp.y - W.y * Dftemp.x;

    s_input[A_read_index] = A_DFT_value;
    s_input[B_read_index] = B_DFT_value;
    s_input[C_read_index] = C_DFT_value;
    s_input[D_read_index] = D_DFT_value;

    PoT = PoT << 1;
    PoTp1 = PoTp1 << 1;
  }

  // last iteration
  if (params::log2_degree > 6) {
    __syncthreads();
    m_param = threadIdx.x;

    if (params::fft_direction)
      W = Get_W_value_inverse((q - 1) * 257 + m_param);
    else
      W = _gTwiddles[(q - 1) * 257 + m_param];

    A_read_index = m_param;
    B_read_index = m_param + PoT;
    C_read_index = m_param + (PoT >> 1);
    D_read_index = m_param + 3 * (PoT >> 1);

    Aftemp = s_input[A_read_index];
    Bftemp = s_input[B_read_index];
    A_DFT_value.x = Aftemp.x + W.x * Bftemp.x - W.y * Bftemp.y;
    A_DFT_value.y = Aftemp.y + W.x * Bftemp.y + W.y * Bftemp.x;
    B_DFT_value.x = Aftemp.x - W.x * Bftemp.x + W.y * Bftemp.y;
    B_DFT_value.y = Aftemp.y - W.x * Bftemp.y - W.y * Bftemp.x;

    Cftemp = s_input[C_read_index];
    Dftemp = s_input[D_read_index];
    C_DFT_value.x = Cftemp.x + W.y * Dftemp.x + W.x * Dftemp.y;
    C_DFT_value.y = Cftemp.y + W.y * Dftemp.y - W.x * Dftemp.x;
    D_DFT_value.x = Cftemp.x - W.y * Dftemp.x - W.x * Dftemp.y;
    D_DFT_value.y = Cftemp.y - W.y * Dftemp.y + W.x * Dftemp.x;

    s_input[A_read_index] = A_DFT_value;
    s_input[B_read_index] = B_DFT_value;
    s_input[C_read_index] = C_DFT_value;
    s_input[D_read_index] = D_DFT_value;
  }
}

template <class params>
__global__ void SMFFT_DIT_external(double2 *d_input, double2 *d_output) {
  __syncthreads();

  extern __shared__ double2 sharedmemBSK[];

  double2 *s_input = sharedmemBSK;

  int cTid = threadIdx.x * params::opt;
#pragma unroll
  for (int i = 0; i < params::opt; i++) {
    double2 tmp;
    switch (params::degree) {
    case 512:
      tmp = INVERSE_TWIDDLES_512[cTid];
      tmp.x *= params::degree;
      tmp.y *= -params::degree;
      break;
    case 1024:
      tmp = gTwiddles1024[cTid];
      break;
    default:
      break;
    }

    d_input[blockIdx.x * params::degree + cTid] *= tmp;
    cTid++;
  }

  __syncthreads();

  s_input[threadIdx.x] = d_input[threadIdx.x + blockIdx.x * params::degree];
  s_input[threadIdx.x + params::quarter] =
      d_input[threadIdx.x + blockIdx.x * params::degree + params::quarter];
  s_input[threadIdx.x + params::half] =
      d_input[threadIdx.x + blockIdx.x * params::degree + params::half];
  s_input[threadIdx.x + params::three_quarters] =
      d_input[threadIdx.x + blockIdx.x * params::degree +
              params::three_quarters];

  __syncthreads();

  do_SMFFT_CT_DIT<params>(s_input);
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    for (int i = 0; i < 1024; i++)
      printf("smfft[%u] %.10f %.10f\n", i, s_input[i].x, s_input[i].y);
  }
  __syncthreads();

  d_output[threadIdx.x + blockIdx.x * params::degree] = s_input[threadIdx.x];
  d_output[threadIdx.x + blockIdx.x * params::degree + params::quarter] =
      s_input[threadIdx.x + params::quarter];
  d_output[threadIdx.x + blockIdx.x * params::degree + params::half] =
      s_input[threadIdx.x + params::half];
  d_output[threadIdx.x + blockIdx.x * params::degree + params::three_quarters] =
      s_input[threadIdx.x + params::three_quarters];

  __syncthreads();
}

#endif // GPU_BOOTSTRAP_SMFFT_CUH

*/
8204
src/fft/twiddles.cu
Normal file
File diff suppressed because it is too large
28
src/fft/twiddles.cuh
Normal file
@@ -0,0 +1,28 @@
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
#define GPU_BOOTSTRAP_TWIDDLES_CUH

// TODO (Agnes) depending on the device architecture
// can we make more of them __constant__?
// Do we have to define them all regardless of the
// polynomial degree and q values?

// TODO (Beka) make those two arrays dynamically sized
// or find the exact maximum; for a polynomial of length 8192 it should
// be less than 2048
extern __constant__ short SW1[2048];
extern __constant__ short SW2[2048];

extern __constant__ double2 negTwids3[4];
extern __constant__ double2 negTwids4[8];
extern __constant__ double2 negTwids5[16];
extern __constant__ double2 negTwids6[32];
extern __constant__ double2 negTwids7[64];
extern __constant__ double2 negTwids8[128];
extern __constant__ double2 negTwids9[256];
extern __constant__ double2 negTwids10[512];
extern __constant__ double2 negTwids11[1024];
extern __device__ double2 negTwids12[2048];
extern __device__ double2 negTwids13[4096];

#endif
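For reference, an illustrative host-side generator for one such table. The exact values and indexing convention live in the suppressed twiddles.cu, so this is only an assumption: it takes negTwidsK[j] = exp(-i*pi*(2j+1)/2^K), a member of the \zeta family described in bnsmfft.cuh, which at least matches the table sizes (2^(K-1) entries):

#include <cmath>
#include <complex>
#include <vector>

std::vector<std::complex<double>> make_neg_twiddles(int k) {
  const double pi = std::acos(-1.0);
  std::vector<std::complex<double>> table(1 << (k - 1));
  for (size_t j = 0; j < table.size(); j++) {
    double angle = -pi * (2.0 * j + 1.0) / double(1 << k);
    table[j] = {std::cos(angle), std::sin(angle)};
  }
  return table;
}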
55
src/keyswitch.cu
Normal file
@@ -0,0 +1,55 @@
#include "keyswitch.cuh"
#include "keyswitch.h"
#include "polynomial/parameters.cuh"

#include <cstdint>

/* Perform keyswitch on a batch of input LWE ciphertexts for 32 bits
 *
 * - lwe_out: output batch of num_samples keyswitched ciphertexts c =
 *   (a0,..,an-1,b) where n is the LWE dimension
 * - lwe_in: input batch of num_samples LWE ciphertexts, containing n
 *   mask values + 1 body value
 *
 * This function calls a wrapper to a device kernel that performs the
 * keyswitch:
 * - num_samples blocks of threads are launched
 */
void cuda_keyswitch_lwe_ciphertext_vector_32(
    void *v_stream, void *lwe_out, void *lwe_in, void *ksk,
    uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
    uint32_t base_log, uint32_t l_gadget, uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
      v_stream, static_cast<uint32_t *>(lwe_out),
      static_cast<uint32_t *>(lwe_in), static_cast<uint32_t *>(ksk),
      lwe_dimension_before, lwe_dimension_after, base_log, l_gadget,
      num_samples);
}

/* Perform keyswitch on a batch of input LWE ciphertexts for 64 bits
 *
 * - lwe_out: output batch of num_samples keyswitched ciphertexts c =
 *   (a0,..,an-1,b) where n is the LWE dimension
 * - lwe_in: input batch of num_samples LWE ciphertexts, containing n
 *   mask values + 1 body value
 *
 * This function calls a wrapper to a device kernel that performs the
 * keyswitch:
 * - num_samples blocks of threads are launched
 */
void cuda_keyswitch_lwe_ciphertext_vector_64(
    void *v_stream, void *lwe_out, void *lwe_in, void *ksk,
    uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
    uint32_t base_log, uint32_t l_gadget, uint32_t num_samples) {
  cuda_keyswitch_lwe_ciphertext_vector(
      v_stream, static_cast<uint64_t *>(lwe_out),
      static_cast<uint64_t *>(lwe_in), static_cast<uint64_t *>(ksk),
      lwe_dimension_before, lwe_dimension_after, base_log, l_gadget,
      num_samples);
}
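Note: to make the calling convention concrete, here is a minimal host-side sketch of invoking the 64-bit entry point. The buffer sizes follow the documented layout (the key-switching key holds lwe_dimension_before * l_gadget blocks of lwe_dimension_after + 1 values, per get_ith_block below), but the parameter values and the way the key material is produced are illustrative assumptions, not part of this commit.

#include <cstdint>
#include <vector>
#include <cuda_runtime.h>
#include "keyswitch.h" // assumed to declare the entry points

int main() {
  // example parameters only
  uint32_t n_before = 1024, n_after = 600;
  uint32_t base_log = 4, l_gadget = 8, num_samples = 100;

  size_t in_len = (size_t)(n_before + 1) * num_samples;
  size_t out_len = (size_t)(n_after + 1) * num_samples;
  size_t ksk_len = (size_t)n_before * l_gadget * (n_after + 1);

  std::vector<uint64_t> h_in(in_len), h_ksk(ksk_len), h_out(out_len);
  // ... fill h_in with ciphertexts and h_ksk with the key-switching key ...

  uint64_t *d_in, *d_out, *d_ksk;
  cudaMalloc(&d_in, in_len * sizeof(uint64_t));
  cudaMalloc(&d_out, out_len * sizeof(uint64_t));
  cudaMalloc(&d_ksk, ksk_len * sizeof(uint64_t));
  cudaMemcpy(d_in, h_in.data(), in_len * sizeof(uint64_t),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_ksk, h_ksk.data(), ksk_len * sizeof(uint64_t),
             cudaMemcpyHostToDevice);

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // the API takes the stream by address, as a void *; the wrapper
  // synchronizes the stream before returning
  cuda_keyswitch_lwe_ciphertext_vector_64(&stream, d_out, d_in, d_ksk,
                                          n_before, n_after, base_log,
                                          l_gadget, num_samples);
  cudaMemcpy(h_out.data(), d_out, out_len * sizeof(uint64_t),
             cudaMemcpyDeviceToHost);
  cudaStreamDestroy(stream);
  cudaFree(d_in); cudaFree(d_out); cudaFree(d_ksk);
}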
146
src/keyswitch.cuh
Normal file
@@ -0,0 +1,146 @@
#ifndef CNCRT_KS_H
#define CNCRT_KS_H

#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "polynomial/polynomial.cuh"
#include <thread>
#include <vector>

template <typename Torus>
__device__ Torus *get_ith_block(Torus *ksk, int i, int level,
                                uint32_t lwe_dimension_after,
                                uint32_t l_gadget) {
  int pos = i * l_gadget * (lwe_dimension_after + 1) +
            level * (lwe_dimension_after + 1);
  Torus *ptr = &ksk[pos];
  return ptr;
}

/*
 * keyswitch kernel
 * Each thread handles a piece of the following equation:
 * $$ GLWE_{s_2}(\Delta m + e) = (0,0,\dots,0,b) - \sum_{i=0}^{k-1}
 *    \langle Dec(a_i), (GLWE_{s_2}(s_{1,i}\,q/\beta), \dots,
 *    GLWE_{s_2}(s_{1,i}\,q/\beta^\ell)) \rangle $$
 * where k is the dimension of the GLWE ciphertext, Dec denotes the
 * decomposition with base beta and l levels, and the inner product is taken
 * between the decomposition of a_i and the l GLWE encryptions of
 * s1_i q/beta^j, with j in [1,l]. If the polynomial dimension in GLWE is > 1,
 * this equation is solved for each polynomial coefficient. We obtain a GLWE
 * encryption of Delta.m (with Delta the scaling factor) under key s2 instead
 * of s1, with an increased noise.
 */
template <typename Torus>
__global__ void keyswitch(Torus *lwe_out, Torus *lwe_in, Torus *ksk,
                          uint32_t lwe_dimension_before,
                          uint32_t lwe_dimension_after, uint32_t base_log,
                          uint32_t l_gadget, int lwe_lower, int lwe_upper,
                          int cutoff) {
  int tid = threadIdx.x;

  extern __shared__ char sharedmem[];

  Torus *local_lwe_out = (Torus *)sharedmem;

  auto block_lwe_in = get_chunk(lwe_in, blockIdx.x, lwe_dimension_before + 1);
  auto block_lwe_out = get_chunk(lwe_out, blockIdx.x, lwe_dimension_after + 1);

  auto gadget = GadgetMatrixSingle<Torus>(base_log, l_gadget);

  // the first `cutoff` threads handle one extra output element each
  int lwe_part_per_thd;
  if (tid < cutoff) {
    lwe_part_per_thd = lwe_upper;
  } else {
    lwe_part_per_thd = lwe_lower;
  }
  __syncthreads();

  for (int k = 0; k < lwe_part_per_thd; k++) {
    int idx = tid + k * blockDim.x;
    local_lwe_out[idx] = 0;
  }

  if (tid == 0) {
    local_lwe_out[lwe_dimension_after] = block_lwe_in[lwe_dimension_before];
  }

  for (int i = 0; i < lwe_dimension_before; i++) {

    __syncthreads();

    Torus a_i = round_to_closest_multiple(block_lwe_in[i], base_log, l_gadget);

    for (int j = 0; j < l_gadget; j++) {
      auto ksk_block = get_ith_block(ksk, i, j, lwe_dimension_after, l_gadget);
      Torus decomposed = gadget.decompose_one_level_single(a_i, (uint32_t)j);
      for (int k = 0; k < lwe_part_per_thd; k++) {
        int idx = tid + k * blockDim.x;
        local_lwe_out[idx] -= (Torus)ksk_block[idx] * decomposed;
      }
    }
  }

  for (int k = 0; k < lwe_part_per_thd; k++) {
    int idx = tid + k * blockDim.x;
    block_lwe_out[idx] = local_lwe_out[idx];
  }
}

/// assumes lwe_in is already on the GPU
template <typename Torus>
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
    void *v_stream, Torus *lwe_out, Torus *lwe_in, Torus *ksk,
    uint32_t lwe_dimension_before, uint32_t lwe_dimension_after,
    uint32_t base_log, uint32_t l_gadget, uint32_t num_samples) {

  constexpr int ideal_threads = 128;

  int lwe_dim = lwe_dimension_after + 1;
  int lwe_lower, lwe_upper, cutoff;
  if (lwe_dim % ideal_threads == 0) {
    lwe_lower = lwe_dim / ideal_threads;
    lwe_upper = lwe_dim / ideal_threads;
    cutoff = 0;
  } else {
    int y =
        ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
    cutoff = ideal_threads - y;
    lwe_lower = lwe_dim / ideal_threads;
    lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
  }

  // int lwe_size_before = (lwe_dimension_before + 1) * num_samples;
  int lwe_size_after = (lwe_dimension_after + 1) * num_samples;

  int shared_mem = sizeof(Torus) * (lwe_dimension_after + 1);

  cudaMemset(lwe_out, 0, sizeof(Torus) * lwe_size_after);

  dim3 grid(num_samples, 1, 1);
  dim3 threads(ideal_threads, 1, 1);

  cudaFuncSetAttribute(keyswitch<Torus>,
                       cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);

  auto stream = static_cast<cudaStream_t *>(v_stream);
  keyswitch<<<grid, threads, shared_mem, *stream>>>(
      lwe_out, lwe_in, ksk, lwe_dimension_before, lwe_dimension_after, base_log,
      l_gadget, lwe_lower, lwe_upper, cutoff);

  cudaStreamSynchronize(*stream);
}

#endif // CNCRT_KS_H
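Note: the lwe_lower / lwe_upper / cutoff split deserves a worked example. With lwe_dimension_after = 600 (so lwe_dim = 601) and ideal_threads = 128: ceil(601/128) = 5, y = 5*128 - 601 = 39, cutoff = 128 - 39 = 89; the first 89 threads copy 5 elements each and the remaining 39 copy 4, for 89*5 + 39*4 = 601 elements total. A minimal host-side sketch of this invariant (names are illustrative):

#include <cassert>
#include <cmath>

// check that the split covers exactly lwe_dim elements for any block size
// (mirrors the wrapper's arithmetic)
void check_split(int lwe_dim, int ideal_threads) {
  int lwe_lower = lwe_dim / ideal_threads;
  int lwe_upper = (lwe_dim % ideal_threads == 0)
                      ? lwe_lower
                      : (int)std::ceil((double)lwe_dim / ideal_threads);
  int cutoff = (lwe_dim % ideal_threads == 0)
                   ? 0
                   : ideal_threads - (lwe_upper * ideal_threads - lwe_dim);
  assert(cutoff * lwe_upper + (ideal_threads - cutoff) * lwe_lower == lwe_dim);
}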
261
src/polynomial/functions.cuh
Normal file
@@ -0,0 +1,261 @@
#ifndef GPU_POLYNOMIAL_FUNCTIONS
#define GPU_POLYNOMIAL_FUNCTIONS
#include "utils/memory.cuh"
#include "utils/timer.cuh"

/*
 * compresses a decomposed buffer into a half-size complex buffer for the fft
 */
template <typename T, class params>
__device__ void real_to_complex_compressed(T *src, double2 *dst) {
  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    dst[tid].x = (double)src[2 * tid];
    dst[tid].y = (double)src[2 * tid + 1];
    tid += params::degree / params::opt;
  }
}

/*
 * copies a source polynomial into the i-th slice of a batch of polynomials
 * used only in the low-latency version
 */
template <typename T, class params>
__device__ void copy_into_ith_polynomial_low_lat(T *source, T *dst, int i) {
  int tid = threadIdx.x;
  int begin = i * (params::degree / 2 + 1);
#pragma unroll
  for (int k = 0; k < params::opt / 2; k++) {
    dst[tid + begin] = source[tid];
    tid = tid + params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    dst[params::degree / 2 + begin] = source[params::degree / 2];
  }
}

/*
 * accumulates the p_id-th slice of a batched polynomial into the destination
 * used only in the low-latency version
 */
template <typename T, class params>
__device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
  int tid = threadIdx.x;
  int begin = p_id * (params::degree / 2 + 1);
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    dst[tid] += source[tid + begin];
    tid = tid + params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    dst[params::degree / 2] += source[params::degree / 2 + begin];
  }
}

/*
 * Performs acc = input / X^j (i.e. input * X^{-j} in the negacyclic ring)
 * if zeroAcc is false; performs acc = 0 if zeroAcc is true.
 * takes a single buffer and calculates in place.
 */
template <typename T, int elems_per_thread, int block_size>
__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
                                                      uint32_t j,
                                                      bool zeroAcc) {
  int tid = threadIdx.x;
  constexpr int degree = block_size * elems_per_thread;
  if (zeroAcc) {
    for (int i = 0; i < elems_per_thread; i++) {
      accumulator[tid] = 0;
      tid += block_size;
    }
  } else {
    tid = threadIdx.x;
    for (int i = 0; i < elems_per_thread; i++) {
      if (j < degree) {
        if (tid < degree - j) {
          accumulator[tid] = input[tid + j];
        } else {
          accumulator[tid] = -input[tid - degree + j];
        }
      } else {
        uint32_t jj = j - degree;
        if (tid < degree - jj) {
          accumulator[tid] = -input[tid + jj];
        } else {
          accumulator[tid] = input[tid - degree + jj];
        }
      }
      tid += block_size;
    }
  }
}

/*
 * Performs result_acc = acc * X^j - acc = acc * (X^j - 1)
 * takes a single buffer as input and returns a single rotated buffer
 */
template <typename T, int elems_per_thread, int block_size>
__device__ void
multiply_by_monomial_negacyclic_and_sub_polynomial(T *acc, T *result_acc,
                                                   uint32_t j) {
  int tid = threadIdx.x;
  constexpr int degree = block_size * elems_per_thread;
  for (int i = 0; i < elems_per_thread; i++) {
    if (j < degree) {
      if (tid < j) {
        result_acc[tid] = -acc[tid - j + degree] - acc[tid];
      } else {
        result_acc[tid] = acc[tid - j] - acc[tid];
      }
    } else {
      uint32_t jj = j - degree;
      if (tid < jj) {
        result_acc[tid] = acc[tid - jj + degree] - acc[tid];
      } else {
        result_acc[tid] = -acc[tid - jj] - acc[tid];
      }
    }
    tid += block_size;
  }
}

/*
 * performs a rounding step that increases the accuracy of the PBS
 * calculates in place.
 */
template <typename T, int elems_per_thread, int block_size>
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
                                                  int l_gadget) {
  int tid = threadIdx.x;
  for (int i = 0; i < elems_per_thread; i++) {
    T x_acc = rotated_acc[tid];
    T shift = sizeof(T) * 8 - l_gadget * base_log;
    T mask = 1ll << (shift - 1);
    T b_acc = (x_acc & mask) >> (shift - 1);
    T res_acc = x_acc >> shift;
    res_acc += b_acc;
    res_acc <<= shift;
    rotated_acc[tid] = res_acc;
    tid = tid + block_size;
  }
}

template <typename Torus, class params>
__device__ void add_to_torus(double2 *m_values, Torus *result) {
  Torus mx = (sizeof(Torus) == 4) ? UINT32_MAX : UINT64_MAX;
  int tid = threadIdx.x;
  // TODO (Beka) check if better memory access is possible
#pragma unroll
  for (int i = 0; i < params::opt / 2; i++) {
    double v1 = m_values[tid].x;
    double v2 = m_values[tid].y;

    double frac = v1 - floor(v1);
    frac *= mx;
    double carry = frac - floor(frac);
    double zero_point_five = 1. / 2.;
    if (carry >= zero_point_five)
      frac++;

    Torus V1 = typecast_double_to_torus<Torus>(frac);

    frac = v2 - floor(v2);
    frac *= mx;
    carry = frac - floor(frac);
    if (carry >= zero_point_five)
      frac++;

    Torus V2 = typecast_double_to_torus<Torus>(frac);

    result[tid * 2] += V1;
    result[tid * 2 + 1] += V2;
    tid = tid + params::degree / params::opt;
  }
}

template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_out, Torus *accumulator) {
  // Set the first coefficient of the accumulator as the body of the LWE sample
  // todo(Joao): not every thread needs to set it
  // if (threadIdx.x == 0)
  lwe_out[params::degree] = accumulator[0];
}

template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_out, Torus *accumulator) {
  // Set ACC = -ACC
  // accumulator.negate_inplace();
  int tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt; i++) {
    accumulator[tid] = -accumulator[tid];
    tid = tid + params::degree / params::opt;
  }
  synchronize_threads_in_block();

  // Reverse the accumulator
  // accumulator.reverse_inplace();
  tid = threadIdx.x;
  Torus result[params::opt];
#pragma unroll
  for (int i = 0; i < params::opt; i++) {
    result[i] = accumulator[params::degree - tid - 1];
    tid = tid + params::degree / params::opt;
  }
  synchronize_threads_in_block();
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt; i++) {
    accumulator[tid] = result[i];
    tid = tid + params::degree / params::opt;
  }
  synchronize_threads_in_block();

  // Perform ACC * X (negacyclic multiplication by the monomial X)
  // accumulator.multiply_by_monomial_negacyclic_inplace(1);
  tid = threadIdx.x;
  for (int i = 0; i < params::opt; i++) {
    if (1 < params::degree) {
      if (tid < 1)
        result[i] = -accumulator[tid - 1 + params::degree];
      else
        result[i] = accumulator[tid - 1];
    } else {
      uint32_t jj = 1 - (uint32_t)params::degree;
      if (tid < jj)
        result[i] = accumulator[tid - jj + params::degree];
      else
        result[i] = -accumulator[tid - jj];
    }
    tid += params::degree / params::opt;
  }
  synchronize_threads_in_block();
  tid = threadIdx.x;
  for (int i = 0; i < params::opt; i++) {
    accumulator[tid] = result[i];
    tid += params::degree / params::opt;
  }
  synchronize_threads_in_block();

  // Copy to the mask of the LWE sample
  // accumulator.copy_into(lwe_out);
  tid = threadIdx.x;
#pragma unroll
  for (int i = 0; i < params::opt; i++) {
    lwe_out[tid] = accumulator[tid];
    tid = tid + params::degree / params::opt;
  }
}

#endif // GPU_POLYNOMIAL_FUNCTIONS
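Note: a quick sanity check of the rounding trick above, as a host-side scalar sketch (the function name and test values are illustrative only). Dropping shift = 8*sizeof(T) - l_gadget*base_log low bits while adding back the highest dropped bit rounds to the nearest multiple of 2^shift:

#include <cassert>
#include <cstdint>

// scalar mirror of the in-place device rounding (uint32_t case)
static uint32_t round_scalar(uint32_t x, uint32_t base_log, uint32_t l_gadget) {
  uint32_t shift = 32 - l_gadget * base_log;
  uint32_t mask = 1u << (shift - 1);
  uint32_t b = (x & mask) >> (shift - 1); // highest dropped bit
  return ((x >> shift) + b) << shift;     // nearest multiple of 2^shift
}

int main() {
  // base_log = 4, l_gadget = 4 => shift = 16
  assert(round_scalar(0x1234ABCDu, 4, 4) == 0x12350000u); // rounds up
  assert(round_scalar(0x12341BCDu, 4, 4) == 0x12340000u); // rounds down
}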
78
src/polynomial/parameters.cuh
Normal file
@@ -0,0 +1,78 @@
#ifndef CNCRT_PARAMETERS_H
#define CNCRT_PARAMETERS_H

constexpr int log2(int n) { return (n <= 2) ? 1 : 1 + log2(n / 2); }

constexpr int choose_opt(int degree) {
  if (degree <= 1024)
    return 4;
  else if (degree == 2048)
    return 8;
  else if (degree == 4096)
    return 16;
  else
    return 32;
}

template <class params> class HalfDegree {
public:
  constexpr static int degree = params::degree / 2;
  constexpr static int opt = params::opt / 2;
  constexpr static int log2_degree = params::log2_degree - 1;
  constexpr static int quarter = params::quarter / 2;
  constexpr static int half = params::half / 2;
  constexpr static int three_quarters = quarter + half;
  constexpr static int warp = 32;
  constexpr static int fft_sm_required = degree + degree / warp;
};

template <int N> class Degree {
public:
  constexpr static int degree = N;
  constexpr static int opt = choose_opt(N);
  constexpr static int log2_degree = log2(N);
  constexpr static int quarter = N / 4;
  constexpr static int half = N / 2;
  constexpr static int three_quarters = half + quarter;
  constexpr static int warp = 32;
  constexpr static int fft_sm_required = N + 32;
};

enum sharedMemDegree { NOSM = 0, PARTIALSM = 1, FULLSM = 2 };

class ForwardFFT {
public:
  constexpr static int direction = 0;
};

class BackwardFFT {
public:
  constexpr static int direction = 1;
};

// members must be public so FFTDegree/FFTParams can read reorder::reorder
class ReorderFFT {
public:
  constexpr static int reorder = 1;
};
class NoReorderFFT {
public:
  constexpr static int reorder = 0;
};

template <class params, class direction, class reorder = ReorderFFT>
class FFTDegree : public params {
public:
  constexpr static int fft_direction = direction::direction;
  constexpr static int fft_reorder = reorder::reorder;
};

template <int N, class direction, class reorder = ReorderFFT>
class FFTParams : public Degree<N> {
public:
  constexpr static int fft_direction = direction::direction;
  constexpr static int fft_reorder = reorder::reorder;
};

#endif // CNCRT_PARAMETERS_H
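Note: as a usage sketch (the instantiation below is illustrative, not taken from this commit), the parameter classes compose like this:

#include "polynomial/parameters.cuh"

// Degree<1024>: degree = 1024, opt = 4, log2_degree = 10,
// quarter = 256, half = 512, three_quarters = 768
using P = FFTParams<1024, ForwardFFT>; // reorder defaults to ReorderFFT
static_assert(P::degree == 1024 && P::opt == 4, "");
static_assert(P::fft_direction == 0 && P::fft_reorder == 1, "");

// HalfDegree<P> gives the 512-point view used for the compressed FFT
static_assert(HalfDegree<P>::degree == 512, "");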
668
src/polynomial/polynomial.cuh
Normal file
@@ -0,0 +1,668 @@
#ifndef CNCRT_POLYNOMIAL_H
#define CNCRT_POLYNOMIAL_H

#include "complex/operations.cuh"
#include "crypto/torus.cuh"
#include "fft/bnsmfft.cuh"
#include "fft/smfft.cuh"
#include "parameters.cuh"
#include "utils/memory.cuh"
#include "utils/timer.cuh"
#include <cassert>
#include <cstdint>

#define PI 3.141592653589793238462643383279502884197

template <typename T>
__device__ T *get_chunk(T *data, int chunk_num, int chunk_size) {
  int pos = chunk_num * chunk_size;
  T *ptr = &data[pos];
  return ptr;
}

class ExtraMemory {
public:
  uint32_t m_size;
  __device__ ExtraMemory(uint32_t size) : m_size(size) {}
};

template <typename T, class params> class PolynomialFourier;

template <typename T, class params> class Polynomial;

template <typename T, class params> class Vector;

template <typename FT, class params> class Twiddles;
template <typename T, class params> class VectorPolynomial {
public:
  T *m_data;
  uint32_t m_num_polynomials;

  __device__ VectorPolynomial(T *data, uint32_t num_polynomials)
      : m_data(data), m_num_polynomials(num_polynomials) {}

  __device__ VectorPolynomial(SharedMemory &shmem, uint32_t num_polynomials)
      : m_num_polynomials(num_polynomials) {
    shmem.get_allocation(&m_data, m_num_polynomials * params::degree);
  }

  __device__ VectorPolynomial<T, params> get_chunk(int chunk_num,
                                                   int chunk_size) {
    int pos = chunk_num * chunk_size;
    T *ptr = &m_data[pos];
    // todo(Joao): unsafe, the user must pass a chunk whose size is a
    // multiple of the polynomial degree
    return VectorPolynomial<T, params>(ptr, chunk_size / params::degree);
  }

  __host__ VectorPolynomial() {}

  __host__ VectorPolynomial(DeviceMemory &dmem, uint32_t num_polynomials,
                            int device)
      : m_num_polynomials(num_polynomials) {
    dmem.get_allocation(&m_data, m_num_polynomials * params::degree, device);
  }

  __host__ VectorPolynomial(DeviceMemory &dmem, T *source,
                            uint32_t num_polynomials, int device)
      : m_num_polynomials(num_polynomials) {
    dmem.get_allocation_and_copy_async(
        &m_data, source, m_num_polynomials * params::degree, device);
  }

  __host__ void copy_to_host(T *dest) {
    cudaMemcpyAsync(dest, m_data,
                    sizeof(T) * m_num_polynomials * params::degree,
                    cudaMemcpyDeviceToHost);
  }

  __device__ void copy_into(Polynomial<T, params> &dest,
                            int polynomial_number = 0) {
    int tid = threadIdx.x;
    int begin = polynomial_number * params::degree;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      dest.coefficients[tid] = m_data[tid + begin];
      tid = tid + params::degree / params::opt;
    }
    synchronize_threads_in_block();
  }

  // todo(Joao): we need to make these APIs clearer, as it's confusing
  // what's being copied where
  __device__ void copy_into_ith_polynomial(PolynomialFourier<T, params> &source,
                                           int i) {
    int tid = threadIdx.x;
    int begin = i * (params::degree / 2 + 1);
#pragma unroll
    for (int k = 0; k < params::opt / 2; k++) {
      this->m_data[tid + begin] = source.m_values[tid];
      tid = tid + params::degree / params::opt;
    }

    if (threadIdx.x == 0) {
      this->m_data[params::degree / 2 + begin] =
          source.m_values[params::degree / 2];
    }
  }

  __device__ void split_into_polynomials(Polynomial<T, params> &first,
                                         Polynomial<T, params> &second) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      first.coefficients[tid] = m_data[tid];
      second.coefficients[tid] = m_data[tid + params::degree];
      tid = tid + params::degree / params::opt;
    }
  }
};
template <typename T, class params> class PolynomialFourier {
public:
  T *m_values;
  uint32_t degree;

  __device__ __host__ PolynomialFourier(T *m_values)
      : m_values(m_values), degree(params::degree) {}

  __device__ PolynomialFourier(SharedMemory &shmem) : degree(params::degree) {
    shmem.get_allocation(&this->m_values, params::degree);
  }

  __device__ PolynomialFourier(SharedMemory &shmem, ExtraMemory extra_memory)
      : degree(params::degree) {
    shmem.get_allocation(&this->m_values, params::degree + extra_memory.m_size);
  }
  __device__ PolynomialFourier(SharedMemory &shmem, uint32_t degree)
      : degree(degree) {
    shmem.get_allocation(&this->m_values, degree);
  }

  __host__ PolynomialFourier(DeviceMemory &dmem, int device)
      : degree(params::degree) {
    dmem.get_allocation(&this->m_values, params::degree, device);
  }

  __device__ char *reuse_memory() { return (char *)m_values; }
  __device__ void copy_from(PolynomialFourier<T, params> &source, int begin) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      this->m_values[tid + begin] = source.m_values[tid];
      tid = tid + params::degree / params::opt;
    }
  }
  __device__ void fill_with(T value) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      m_values[tid] = value;
      tid += params::degree / params::opt;
    }
  }

  /*
  __device__ void add_polynomial_inplace(PolynomialFourier<T, params> &source,
                                         int begin) {
    int tid = threadIdx.x;
  #pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      this->m_values[tid] += source.m_values[tid + begin];
      tid = tid + params::degree / params::opt;
    }
    if (threadIdx.x == 0) {
      this->m_values[params::degree / 2] +=
          source.m_values[params::degree / 2 + begin];
    }
  }
  */

  __device__ void swap_quarters_inplace() {
    int tid = threadIdx.x;
    int s1 = params::quarter;
    int s2 = params::three_quarters;

    T tmp = m_values[s2 + tid];
    m_values[s2 + tid] = m_values[s1 + tid];
    m_values[s1 + tid] = tmp;
  }

  __device__ void add_polynomial_inplace(VectorPolynomial<T, params> &source,
                                         int polynomial_number) {
    int tid = threadIdx.x;
    int begin = polynomial_number * (params::degree / 2 + 1);
#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      this->m_values[tid] += source.m_data[tid + begin];
      tid = tid + params::degree / params::opt;
    }

    if (threadIdx.x == 0) {
      this->m_values[params::degree / 2] +=
          source.m_data[params::degree / 2 + begin];
    }
  }

  __device__ void
  forward_negacyclic_fft_inplace(PolynomialFourier<double2, params> &X) {
    // TODO this function should be removed
  }

  __device__ void inverse_negacyclic_fft_inplace() {
    // TODO this function should be removed
  }

  template <typename Torus>
  __device__ void add_to_torus(Polynomial<Torus, params> &result) {
    // TODO this function should be removed
  }

  __device__ T &operator[](int i) { return m_values[i]; }
};
template <typename T, class params> class Polynomial {
public:
  T *coefficients;
  uint32_t degree;

  __device__ Polynomial(T *coefficients, uint32_t degree)
      : coefficients(coefficients), degree(degree) {}

  __device__ Polynomial(char *memory, uint32_t degree)
      : coefficients((T *)memory), degree(degree) {}

  __device__ Polynomial(SharedMemory &shmem, uint32_t degree) : degree(degree) {
    shmem.get_allocation(&this->coefficients, degree);
  }

  __host__ Polynomial(DeviceMemory &dmem, uint32_t degree, int device)
      : degree(degree) {
    dmem.get_allocation(&this->coefficients, params::degree, device);
  }

  __host__ Polynomial(DeviceMemory &dmem, T *source, uint32_t degree,
                      int device)
      : degree(degree) {
    dmem.get_allocation_and_copy_async(&this->coefficients, source,
                                       params::degree, device);
  }

  __host__ void copy_to_host(T *dest) {
    cudaMemcpyAsync(dest, this->coefficients, sizeof(T) * params::degree,
                    cudaMemcpyDeviceToHost);
  }

  __device__ T get_coefficient(int i) { return this->coefficients[i]; }

  __device__ char *reuse_memory() { return (char *)coefficients; }

  __device__ void copy_coefficients_from(Polynomial<T, params> &source,
                                         int begin_dest = 0,
                                         int begin_src = 0) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      this->coefficients[tid + begin_dest] =
          source.coefficients[tid + begin_src];
      tid = tid + params::degree / params::opt;
    }
  }

  __device__ void fill_with(T value) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      coefficients[tid] = value;
      tid += params::degree / params::opt;
    }
  }

  __device__ void multiply_by_monomial_negacyclic(Polynomial<T, params> &result,
                                                  uint32_t j) {
    int tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      if (j < params::degree) {
        if (tid < j)
          result.coefficients[tid] =
              -this->coefficients[tid - j + params::degree];
        else
          result.coefficients[tid] = this->coefficients[tid - j];
      } else {
        uint32_t jj = j - params::degree;
        if (tid < jj)
          result.coefficients[tid] =
              this->coefficients[tid - jj + params::degree];
        else
          result.coefficients[tid] = -this->coefficients[tid - jj];
      }
      tid += params::degree / params::opt;
    }
  }

  __device__ void multiply_by_monomial_negacyclic_inplace(uint32_t j) {
    int tid = threadIdx.x;
    T result[params::opt];
    for (int i = 0; i < params::opt; i++) {
      if (j < params::degree) {
        if (tid < j)
          result[i] = -this->coefficients[tid - j + params::degree];
        else
          result[i] = this->coefficients[tid - j];
      } else {
        uint32_t jj = j - params::degree;
        if (tid < jj)
          result[i] = this->coefficients[tid - jj + params::degree];
        else
          result[i] = -this->coefficients[tid - jj];
      }
      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();
    tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      coefficients[tid] = result[i];
      tid += params::degree / params::opt;
    }
    synchronize_threads_in_block();
  }

  __device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
      Polynomial<T, params> &result, uint32_t j) {
    int tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      if (j < params::degree) {
        if (tid < j)
          result.coefficients[tid] =
              -this->coefficients[tid - j + params::degree] -
              this->coefficients[tid];
        else
          result.coefficients[tid] =
              this->coefficients[tid - j] - this->coefficients[tid];
      } else {
        uint32_t jj = j - params::degree;
        if (tid < jj)
          result.coefficients[tid] =
              this->coefficients[tid - jj + params::degree] -
              this->coefficients[tid];
        else
          result.coefficients[tid] =
              -this->coefficients[tid - jj] - this->coefficients[tid];
      }
      tid += params::degree / params::opt;
    }
  }

  __device__ void divide_by_monomial_negacyclic(Polynomial<T, params> &result,
                                                uint32_t j) {
    int tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      if (j < params::degree) {
        if (tid < params::degree - j) {
          result.coefficients[tid] = this->coefficients[tid + j];
        } else {
          result.coefficients[tid] =
              -this->coefficients[tid - params::degree + j];
        }
      } else {
        uint32_t jj = j - params::degree;
        if (tid < params::degree - jj) {
          result.coefficients[tid] = -this->coefficients[tid + jj];
        } else {
          result.coefficients[tid] =
              this->coefficients[tid - params::degree + jj];
        }
      }
      tid += params::degree / params::opt;
    }
  }

  __device__ void divide_by_monomial_negacyclic_inplace(uint32_t j) {
    int tid = threadIdx.x;
    T result[params::opt];
    for (int i = 0; i < params::opt; i++) {
      if (j < params::degree) {
        if (tid < params::degree - j) {
          result[i] = this->coefficients[tid + j];
        } else {
          result[i] = -this->coefficients[tid - params::degree + j];
        }
      } else {
        uint32_t jj = j - params::degree;
        if (tid < params::degree - jj) {
          result[i] = -this->coefficients[tid + jj];
        } else {
          result[i] = this->coefficients[tid - params::degree + jj];
        }
      }
      tid += params::degree / params::opt;
    }
    tid = threadIdx.x;
    for (int i = 0; i < params::opt; i++) {
      coefficients[tid] = result[i];
      tid = tid + params::degree / params::opt;
    }
  }

  __device__ void round_to_closest_multiple_inplace(uint32_t base_log,
                                                    uint32_t l_gadget) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      T x = coefficients[tid];
      T shift = sizeof(T) * 8 - l_gadget * base_log;
      T mask = 1ll << (shift - 1);
      T b = (x & mask) >> (shift - 1);
      T res = x >> shift;
      res += b;
      res <<= shift;
      coefficients[tid] = res;
      tid = tid + params::degree / params::opt;
    }
  }

  __device__ void
  to_complex_compressed(PolynomialFourier<double2, params> &dest) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt / 2; i++) {
      dest.m_values[tid].x = (double)coefficients[2 * tid];
      dest.m_values[tid].y = (double)coefficients[2 * tid + 1];
      tid += params::degree / params::opt;
    }
  }

  __device__ void to_complex(PolynomialFourier<double2, params> &dest) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      dest.m_values[tid].x = (double)coefficients[tid];
      dest.m_values[tid].y = 0.0;
      tid += params::degree / params::opt;
    }
  }

  __device__ void multiply_by_scalar_inplace(T scalar) {
    int tid = threadIdx.x;
    const int grid_dim = blockDim.x;
    const int slices = params::degree / grid_dim;
    const int jump = grid_dim;
    for (int i = 0; i < slices; i++) {
      this->coefficients[tid] *= scalar;
      tid += jump;
    }
  }

  __device__ void add_scalar_inplace(T scalar) {
    int tid = threadIdx.x;
    const int grid_dim = blockDim.x;
    const int slices = params::degree / grid_dim;
    const int jump = grid_dim;
    for (int i = 0; i < slices; i++) {
      this->coefficients[tid] += scalar;
      tid += jump;
    }
  }

  __device__ void sub_scalar_inplace(T scalar) {
    int tid = threadIdx.x;
    const int grid_dim = blockDim.x;
    const int slices = params::degree / grid_dim;
    const int jump = grid_dim;
    for (int i = 0; i < slices; i++) {
      this->coefficients[tid] -= scalar;
      tid += jump;
    }
  }

  /*
  __device__ void add_polynomial_inplace(Polynomial<T, params> &source,
                                         int begin) {
    int tid = threadIdx.x;
  #pragma unroll
    for (int i = 0; i < params::opt; i++) {
      this->coefficients[tid] += source.coefficients[tid + begin];
      tid = tid + params::degree / params::opt;
    }
  }
  */

  __device__ void sub_polynomial_inplace(Polynomial<T, params> &rhs) {
    int tid = threadIdx.x;
    const int grid_dim = blockDim.x;
    const int slices = params::degree / grid_dim;
    const int jump = grid_dim;
    for (int i = 0; i < slices; i++) {
      this->coefficients[tid] -= rhs.coefficients[tid];
      tid += jump;
    }
  }

  __device__ void negate_inplace() {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      coefficients[tid] = -coefficients[tid];
      tid = tid + params::degree / params::opt;
    }
    synchronize_threads_in_block();
  }

  __device__ void copy_into(Vector<T, params> &vec) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      vec.m_data[tid] = coefficients[tid];
      tid = tid + params::degree / params::opt;
    }
  }

  __device__ void copy_reversed_into(Vector<T, params> &vec) {
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      vec.m_data[tid] = coefficients[params::degree - tid - 1];
      tid = tid + params::degree / params::opt;
    }
  }

  __device__ void reverse_inplace() {
    int tid = threadIdx.x;
    T result[params::opt];
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      result[i] = coefficients[params::degree - tid - 1];
      tid = tid + params::degree / params::opt;
    }
    synchronize_threads_in_block();
    tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt; i++) {
      coefficients[tid] = result[i];
      tid = tid + params::degree / params::opt;
    }
    synchronize_threads_in_block();
  }

  template <typename FT>
  __device__ void
  forward_negacyclic_fft_half(PolynomialFourier<FT, params> &result) {
    // TODO this function should be removed
  }
};
template <typename T, class params> class Vector {
public:
  T *m_data;
  uint32_t m_size;

  __device__ Vector(T *elements, uint32_t size)
      : m_data(elements), m_size(size) {}

  template <typename V>
  __device__ Vector(SharedMemory &shmem, V src, int size) : m_size(size) {
    shmem.get_allocation(&m_data, m_size);
    int tid = threadIdx.x;
#pragma unroll
    for (int i = 0; i < params::opt && tid < m_size; i++) {
      m_data[tid] = src[tid];
      tid += params::degree / params::opt;
    }
  }

  __device__ Vector(SharedMemory &shmem, uint32_t size) : m_size(size) {
    shmem.get_allocation(&m_data, m_size);
  }

  __host__ Vector() {}

  __host__ Vector(DeviceMemory &dmem, uint32_t size, int device)
      : m_size(size) {
    dmem.get_allocation(&m_data, m_size, device);
  }

  __device__ T &operator[](int i) { return m_data[i]; }

  __device__ Vector<T, params> get_chunk(int chunk_num, int chunk_size) {
    int pos = chunk_num * chunk_size;
    T *ptr = &m_data[pos];
    return Vector<T, params>(ptr, chunk_size);
  }

  __host__ void copy_to_device(T *source, uint32_t elements) {
    cudaMemcpyAsync(m_data, source, sizeof(T) * elements,
                    cudaMemcpyHostToDevice);
  }

  __host__ Vector(DeviceMemory &dmem, T *source, uint32_t size_source,
                  int device)
      : m_size(size_source) {
    dmem.get_allocation_and_copy_async(&m_data, source, m_size, device);
  }

  __host__ Vector(DeviceMemory &dmem, T *source, uint32_t allocation_size,
                  uint32_t copy_size, int device)
      : m_size(allocation_size) {
    if (copy_size > allocation_size) {
      printf("warning: copying more than the allocation\n");
    }
    dmem.get_allocation_and_copy_async(&m_data, source, m_size, copy_size,
                                       device);
  }

  __host__ void copy_to_host(T *dest) {
    cudaMemcpyAsync(dest, m_data, sizeof(T) * m_size, cudaMemcpyDeviceToHost);
  }

  __host__ void copy_to_host(T *dest, int elements) {
    cudaMemcpyAsync(dest, m_data, sizeof(T) * elements, cudaMemcpyDeviceToHost);
  }

  __device__ T get_ith_element(int i) { return m_data[i]; }

  __device__ T get_last_element() { return m_data[m_size - 1]; }

  __device__ void set_last_element(T elem) { m_data[m_size - 1] = elem; }

  // todo(Joao): let's do coalesced access here at some point
  __device__ void operator-=(const Vector<T, params> &rhs) {
    assert(m_size == rhs.m_size);
    int pos = threadIdx.x;
    int total = m_size / blockDim.x + 1;
    for (int i = 0; i < total; i++) {
      if (pos < m_size)
        m_data[pos] -= rhs.m_data[pos];
      pos += blockDim.x;
    }
  }

  __device__ void operator*=(const T &rhs) {
    int pos = threadIdx.x;
    int total = m_size / blockDim.x + 1;
    for (int i = 0; i < total; i++) {
      if (pos < m_size)
        m_data[pos] *= rhs;
      pos += blockDim.x;
    }
  }
};
template <typename FT, class params> class Twiddles {
public:
  Vector<FT, params> twiddles2, twiddles3, twiddles4, twiddles5, twiddles6,
      twiddles7, twiddles8, twiddles9, twiddles10;

  __device__
  Twiddles(Vector<FT, params> &twiddles2, Vector<FT, params> &twiddles3,
           Vector<FT, params> &twiddles4, Vector<FT, params> &twiddles5,
           Vector<FT, params> &twiddles6, Vector<FT, params> &twiddles7,
           Vector<FT, params> &twiddles8, Vector<FT, params> &twiddles9,
           Vector<FT, params> &twiddles10)
      : twiddles2(twiddles2), twiddles3(twiddles3), twiddles4(twiddles4),
        twiddles5(twiddles5), twiddles6(twiddles6), twiddles7(twiddles7),
        twiddles8(twiddles8), twiddles9(twiddles9), twiddles10(twiddles10) {}
};

#endif // CNCRT_POLYNOMIAL_H
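Note: for intuition on the monomial routines above, in the negacyclic ring $R = \mathbb{Z}[X]/(X^N+1)$ the relation $X^N = -1$ holds, so for $0 \le j < N$:

$$ X^j \cdot \sum_{k=0}^{N-1} p_k X^k \;=\; \sum_{k=j}^{N-1} p_{k-j} X^{k} \;-\; \sum_{k=0}^{j-1} p_{k-j+N} X^{k}, $$

which is exactly the two branches (plain copy vs. negated wrap-around) implemented by multiply_by_monomial_negacyclic; divide_by_monomial_negacyclic applies the same identity with $X^{-j} = -X^{N-j}$.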
65
src/polynomial/polynomial_math.cuh
Normal file
@@ -0,0 +1,65 @@
#ifndef CNCRT_POLYNOMIAL_MATH_H
#define CNCRT_POLYNOMIAL_MATH_H

#include "crypto/torus.cuh"
#include "parameters.cuh"
#include "polynomial.cuh"

template <class params, typename FT>
__device__ void polynomial_product_in_fourier_domain(FT *result, FT *first,
                                                     FT *second) {
  int tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    result[tid] = first[tid] * second[tid];
    tid += params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    result[params::degree / 2] =
        first[params::degree / 2] * second[params::degree / 2];
  }
}

template <class params, typename FT>
__device__ void polynomial_product_in_fourier_domain(
    PolynomialFourier<FT, params> &result, PolynomialFourier<FT, params> &first,
    PolynomialFourier<FT, params> &second) {
  int tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    result[tid] = first[tid] * second[tid];
    tid += params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    result[params::degree / 2] =
        first[params::degree / 2] * second[params::degree / 2];
  }
}

template <class params, typename FT>
__device__ void polynomial_product_accumulate_in_fourier_domain(
    PolynomialFourier<FT, params> &result, PolynomialFourier<FT, params> &first,
    PolynomialFourier<FT, params> &second) {
  int tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    result[tid] += first[tid] * second[tid];
    tid += params::degree / params::opt;
  }

  if (threadIdx.x == 0) {
    result[params::degree / 2] +=
        first[params::degree / 2] * second[params::degree / 2];
  }
}

template <class params, typename FT>
__device__ void polynomial_product_accumulate_in_fourier_domain(
    FT *result, FT *first, PolynomialFourier<FT, params> &second) {
  int tid = threadIdx.x;
  for (int i = 0; i < params::opt / 2; i++) {
    result[tid] += first[tid] * second.m_values[tid];
    tid += params::degree / params::opt;
  }
}

#endif // CNCRT_POLYNOMIAL_MATH_H
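Note: why pointwise products suffice here: under the negacyclic FFT, multiplication in $R = \mathbb{Z}[X]/(X^N+1)$ becomes coordinate-wise multiplication of the transformed vectors, $\widehat{pq}_k = \hat{p}_k\,\hat{q}_k$, so a polynomial product costs one transform per operand, $N/2 + 1$ complex multiplies (thanks to the compressed representation), and one inverse transform.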
1
src/polynomial/polynomial_twiddles.cuh
Normal file
@@ -0,0 +1 @@
#include "polynomial.cuh"
76
src/types/int128.cuh
Normal file
@@ -0,0 +1,76 @@
#ifndef CNCRT_INT128_H
#define CNCRT_INT128_H

// abseil's int128 type
// licensed under the Apache license

class uint128 {
public:
  __device__ uint128(uint64_t high, uint64_t low) : hi_(high), lo_(low) {}

  uint64_t lo_;
  uint64_t hi_;
};

class int128 {
public:
  int128() = default;

  __device__ operator unsigned long long() const {
    return static_cast<unsigned long long>(lo_);
  }

  __device__ int128(int64_t high, uint64_t low) : hi_(high), lo_(low) {}

  uint64_t lo_;
  int64_t hi_;
};

__device__ inline uint128 make_uint128(uint64_t high, uint64_t low) {
  return uint128(high, low);
}

template <typename T> __device__ uint128 make_uint128_from_float(T v) {
  if (v >= ldexp(static_cast<T>(1), 64)) {
    uint64_t hi = static_cast<uint64_t>(ldexp(v, -64));
    uint64_t lo = static_cast<uint64_t>(v - ldexp(static_cast<T>(hi), 64));
    return make_uint128(hi, lo);
  }

  return make_uint128(0, static_cast<uint64_t>(v));
}

__device__ inline int128 make_int128(int64_t high, uint64_t low) {
  return int128(high, low);
}

__device__ inline int64_t bitcast_to_signed(uint64_t v) {
  return v & (uint64_t{1} << 63) ? ~static_cast<int64_t>(~v)
                                 : static_cast<int64_t>(v);
}

__device__ inline uint64_t uint128_high64(uint128 v) { return v.hi_; }
__device__ inline uint64_t uint128_low64(uint128 v) { return v.lo_; }

__device__ __forceinline__ uint128 operator-(uint128 val) {
  uint64_t hi = ~uint128_high64(val);
  uint64_t lo = ~uint128_low64(val) + 1;
  if (lo == 0)
    ++hi; // carry
  return make_uint128(hi, lo);
}

template <typename T> __device__ int128 make_int128_from_float(T v) {
  // We must convert the absolute value and then negate as needed, because
  // floating point types are typically sign-magnitude. Otherwise, the
  // difference between the high and low 64 bits when interpreted as two's
  // complement overwhelms the precision of the mantissa.
  uint128 result =
      v < 0 ? -make_uint128_from_float(-v) : make_uint128_from_float(v);

  return make_int128(bitcast_to_signed(uint128_high64(result)),
                     uint128_low64(result));
}

#endif // CNCRT_INT128_H
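Note: a small illustrative device snippet (not part of the commit; kernel and buffer names are hypothetical). Because conversion truncates the magnitude first, -3.5 maps to -3, whose low limb in two's complement is 0xFFFFFFFFFFFFFFFD:

__global__ void int128_demo(unsigned long long *out) {
  int128 x = make_int128_from_float(-3.5); // magnitude truncates: -3
  out[0] = static_cast<unsigned long long>(x); // low 64 bits: 0xFF..FD
}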
15
src/utils/kernel_dimensions.cuh
Normal file
@@ -0,0 +1,15 @@
// round x up to the next power of two
int nextPow2(int x) {
  --x;
  x |= x >> 1;
  x |= x >> 2;
  x |= x >> 4;
  x |= x >> 8;
  x |= x >> 16;
  return ++x;
}

// pick a power-of-two block size (capped at maxBlockSize) and the number of
// blocks needed to cover n elements
void getNumBlocksAndThreads(const int n, const int maxBlockSize, int &blocks,
                            int &threads) {
  threads = (n < maxBlockSize * 2) ? nextPow2((n + 1) / 2) : maxBlockSize;
  blocks = (n + threads - 1) / threads;
}
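Note: a worked example of the sizing logic (values are illustrative), assuming this header is included:

#include <cassert>
#include "utils/kernel_dimensions.cuh"

int main() {
  int blocks, threads;
  // n = 1000 < 2 * 512, so threads = nextPow2((1000 + 1) / 2)
  //   = nextPow2(500) = 512, and blocks = ceil(1000 / 512) = 2,
  // giving 1024 threads in total to cover the 1000 elements.
  getNumBlocksAndThreads(1000, 512, blocks, threads);
  assert(threads == 512 && blocks == 2);
}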
90
src/utils/memory.cuh
Normal file
@@ -0,0 +1,90 @@
#ifndef CNCRT_SHMEM_H
#define CNCRT_SHMEM_H

#include "helper_cuda.h"
#include <atomic>
#include <iostream>
#include <mutex>
#include <thread>
#include <tuple>
#include <vector>

// bump allocator over a block of dynamic shared memory
class SharedMemory {
public:
  char *m_memory_block;
  int m_last_byte;

  __device__ SharedMemory(char *ptr) : m_memory_block(ptr), m_last_byte(0) {}

  template <typename T> __device__ void get_allocation(T **ptr, int elements) {
    *ptr = (T *)(&this->m_memory_block[m_last_byte]);
    this->m_last_byte += elements * sizeof(T);
  }
};

// tracks device allocations and frees them all on destruction
class DeviceMemory {
public:
  std::vector<std::tuple<void *, int>> m_allocated;
  std::mutex m_allocation_mtx;
  std::atomic<uint32_t> m_total_devices;

  DeviceMemory() : m_total_devices(1) {}

  __host__ void set_device(int device) {
    if (device > m_total_devices)
      m_total_devices = device + 1;
  }

  template <typename T>
  __host__ void get_allocation(T **ptr, int elements, int device) {
    T *res;
    cudaMalloc((void **)&res, sizeof(T) * elements);
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int elements,
                                              int device) {
    T *res;
    cudaMalloc((void **)&res, sizeof(T) * elements);
    cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice);
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  template <typename T>
  __host__ void get_allocation_and_copy_async(T **ptr, T *src, int allocation,
                                              int elements, int device) {
    T *res;
    cudaMalloc((void **)&res, sizeof(T) * allocation);
    cudaMemcpyAsync(res, src, sizeof(T) * elements, cudaMemcpyHostToDevice);
    *ptr = res;
    std::lock_guard<std::mutex> lock(m_allocation_mtx);
    m_allocated.push_back(std::make_tuple(res, device));
  }

  void free_all_from_device(int device) {
    cudaSetDevice(device);
    for (auto elem : m_allocated) {
      auto dev = std::get<1>(elem);
      if (dev == device) {
        auto mem = std::get<0>(elem);
        checkCudaErrors(cudaFree(mem));
      }
    }
  }

  __host__ ~DeviceMemory() {
    for (auto elem : m_allocated) {
      auto dev = std::get<1>(elem);
      auto mem = std::get<0>(elem);
      cudaSetDevice(dev);
      checkCudaErrors(cudaFree(mem));
    }
  }
};

#endif // CNCRT_SHMEM_H
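Note: a hypothetical kernel (not from this commit) showing the intended SharedMemory pattern: carve typed slices out of a single dynamic shared memory block.

template <typename Torus, class params>
__global__ void example_kernel() {
  extern __shared__ char sharedmem[];
  SharedMemory shmem(sharedmem);

  Torus *acc;       // params::degree accumulator coefficients
  double2 *fft_buf; // half-size complex buffer for the FFT
  shmem.get_allocation(&acc, params::degree);
  shmem.get_allocation(&fft_buf, params::degree / 2 + 1);
  // the launch must pass at least the sum of these sizes, in bytes,
  // as the dynamic shared memory argument of <<<...>>>
}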
29
src/utils/timer.cuh
Normal file
@@ -0,0 +1,29 @@
#ifndef CNCRT_TIMER_H
#define CNCRT_TIMER_H

#define synchronize_threads_in_block() __syncthreads()

// RAII helper: when `active` is true, measures the time between its
// construction and destruction with CUDA events and prints the result
template <bool active> class CudaMeasureExecution {
public:
  cudaEvent_t m_start, m_stop;

  __host__ CudaMeasureExecution() {
    if constexpr (active) {
      cudaEventCreate(&m_start);
      cudaEventCreate(&m_stop);
      cudaEventRecord(m_start);
    }
  }

  __host__ ~CudaMeasureExecution() {
    if constexpr (active) {
      float ms;
      cudaEventRecord(m_stop);
      cudaEventSynchronize(m_stop);
      cudaEventElapsedTime(&ms, m_start, m_stop);
      std::cout << "Execution took " << ms << "ms" << std::endl;
    }
  }
};

#endif // CNCRT_TIMER_H
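Note: hypothetical usage (kernel, grid and threads names are placeholders): scope the timer around a launch inside some host function, e.g.

{
  CudaMeasureExecution<true> timer; // constructor records the start event
  my_kernel<<<grid, threads>>>(/* ... */);
} // destructor records the stop event, synchronizes, and prints the time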