perf(backend-cuda): Update cuda backend to the latest tfhe-rs version (0.10.0)

Bourgerie Quentin
2024-09-24 15:05:51 +00:00
committed by Quentin Bourgerie
parent 9a85d33c5b
commit ccf491e0a1
100 changed files with 183 additions and 27993 deletions

.gitmodules

@@ -6,3 +6,6 @@
[submodule "lattice-estimator"]
path = third_party/lattice-estimator
url = https://github.com/malb/lattice-estimator
[submodule "third_party/tfhe-rs"]
path = third_party/tfhe-rs
url = https://github.com/zama-ai/tfhe-rs.git


@@ -4,6 +4,7 @@ autofix: false
# list of paths to ignore, uses gitignore syntaxes (executes before any rule)
ignore:
- compilers/concrete-compiler/llvm-project
- backends/concrete-cuda/implementation
rules:
# checks if file ends in a newline character


@@ -0,0 +1 @@
../../third_party/tfhe-rs/backends/tfhe-cuda-backend/cuda/


@@ -1,86 +0,0 @@
cmake_minimum_required(VERSION 3.24 FATAL_ERROR)
project(concrete_cuda LANGUAGES CXX CUDA)
# Check that a CUDA toolchain meeting the minimum supported version is available; if not, fail the configuration.
set(MINIMUM_SUPPORTED_CUDA_VERSION 10.0)
include(CheckLanguage)
# See if CUDA is available
check_language(CUDA)
# If so, enable CUDA to check the version.
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
endif()
# If CUDA is not available, or the minimum version is too low do not build
if(NOT CMAKE_CUDA_COMPILER)
message(FATAL_ERROR "Cuda compiler not found.")
endif()
if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS ${MINIMUM_SUPPORTED_CUDA_VERSION})
message(FATAL_ERROR "CUDA ${MINIMUM_SUPPORTED_CUDA_VERSION} or greater is required for compilation.")
endif()
# Get CUDA compute capability
set(OUTPUTFILE ${CMAKE_CURRENT_SOURCE_DIR}/cuda_script) # No suffix required
set(CUDAFILE ${CMAKE_CURRENT_SOURCE_DIR}/check_cuda.cu)
execute_process(COMMAND nvcc -lcuda ${CUDAFILE} -o ${OUTPUTFILE})
execute_process(
COMMAND ${OUTPUTFILE}
RESULT_VARIABLE CUDA_RETURN_CODE
OUTPUT_VARIABLE ARCH)
file(REMOVE ${OUTPUTFILE})
if(${CUDA_RETURN_CODE} EQUAL 0)
set(CUDA_SUCCESS "TRUE")
else()
set(CUDA_SUCCESS "FALSE")
endif()
if(${CUDA_SUCCESS})
message(STATUS "CUDA Architecture: ${ARCH}")
message(STATUS "CUDA Version: ${CUDA_VERSION_STRING}")
message(STATUS "CUDA Path: ${CUDA_TOOLKIT_ROOT_DIR}")
message(STATUS "CUDA Libraries: ${CUDA_LIBRARIES}")
message(STATUS "CUDA Performance Primitives: ${CUDA_npp_LIBRARY}")
set(CUDA_NVCC_FLAGS "${ARCH}")
# add_definitions(-DGPU) #You may not require this
else()
message(WARNING ${ARCH})
endif()
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
if(NOT CUDA_NVCC_FLAGS)
set(CUDA_NVCC_FLAGS -arch=sm_70)
endif()
# In production, use -arch=sm_70; add --ptxas-options=-v to see register spills and -lineinfo for better debugging
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -ccbin ${CMAKE_CXX_COMPILER} -O3 ${CUDA_NVCC_FLAGS} \
-std=c++17 --no-exceptions --expt-relaxed-constexpr -rdc=true --use_fast_math -Xcompiler -fPIC")
set(INCLUDE_DIR include)
add_subdirectory(src)
add_subdirectory(test_and_benchmark)
target_include_directories(concrete_cuda PRIVATE ${INCLUDE_DIR})
# This is required for rust cargo build
install(TARGETS concrete_cuda DESTINATION .)
install(TARGETS concrete_cuda DESTINATION lib)
# Define a function to add a lint target.
find_file(CPPLINT NAMES cpplint cpplint.exe)
if(CPPLINT)
# Add a custom target to lint all child projects. Dependencies are specified in child projects.
add_custom_target(all_lint)
# Don't trigger this target on ALL_BUILD or Visual Studio 'Rebuild Solution'
set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_ALL TRUE)
# set_target_properties(all_lint PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD TRUE)
endif()
enable_testing()


@@ -1,3 +0,0 @@
set noparent
linelength=240
filter=-legal/copyright,-readability/todo,-runtime/references,-build/c++17


@@ -1,53 +0,0 @@
# Concrete Cuda
## Introduction
Concrete-cuda holds the code for GPU acceleration of Zama's variant of TFHE.
It is one of the backends of the Concrete Compiler.
It implements CUDA/C++ functions to perform homomorphic operations on LWE ciphertexts.
It provides functions to allocate memory on the GPU, to copy data back
and forth between the CPU and the GPU, to create and destroy Cuda streams, etc.:
- `cuda_create_stream`, `cuda_destroy_stream`
- `cuda_malloc`, `cuda_check_valid_malloc`
- `cuda_memcpy_async_to_cpu`, `cuda_memcpy_async_to_gpu`
- `cuda_get_number_of_gpus`
- `cuda_synchronize_device`
The cryptographic operations it provides are:
- an amortized implementation of the TFHE programmable bootstrap: `cuda_bootstrap_amortized_lwe_ciphertext_vector_32` and `cuda_bootstrap_amortized_lwe_ciphertext_vector_64`
- a low-latency implementation of the TFHE programmable bootstrap: `cuda_bootstrap_low_latency_lwe_ciphertext_vector_32` and `cuda_bootstrap_low_latency_lwe_ciphertext_vector_64`
- the keyswitch: `cuda_keyswitch_lwe_ciphertext_vector_32` and `cuda_keyswitch_lwe_ciphertext_vector_64`
- the larger precision programmable bootstrap (wop PBS, which supports up to 16 bits of message while the classical PBS only supports up to 8 bits of message) and its sub-components: `cuda_wop_pbs_64`, `cuda_extract_bits_64`, `cuda_circuit_bootstrap_64`, `cuda_cmux_tree_64`, `cuda_blind_rotation_sample_extraction_64`
- acceleration for leveled operations: `cuda_negate_lwe_ciphertext_vector_64`, `cuda_add_lwe_ciphertext_vector_64`, `cuda_add_lwe_ciphertext_vector_plaintext_vector_64`, `cuda_mult_lwe_ciphertext_vector_cleartext_vector`.
## Dependencies
**Disclaimer**: Compilation on Windows/Mac is not supported yet. Only Nvidia GPUs are supported.
<!-- markdown-link-check-disable-next-line -->
- nvidia driver - for example, if you're running Ubuntu 20.04 check this [page](https://linuxconfig.org/how-to-install-the-nvidia-drivers-on-ubuntu-20-04-focal-fossa-linux) for installation
- [nvcc](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) >= 10.0
- [gcc](https://gcc.gnu.org/) >= 8.0 - check this [page](https://gist.github.com/ax3l/9489132) for more details about nvcc/gcc compatible versions
- [cmake](https://cmake.org/) >= 3.24
## Build
The CUDA project in `concrete-cuda` can be compiled independently of the rest of
Concrete as follows:
```
git clone git@github.com:zama-ai/concrete
cd concrete/backends/concrete-cuda/implementation
mkdir build
cd build
cmake ..
make
```
The compute capability is detected automatically (from the properties of the first GPU) and set accordingly.
## Links
- [TFHE](https://eprint.iacr.org/2018/421.pdf)
## License
This software is distributed under the BSD-3-Clause-Clear license. If you have any questions,
please contact us at `hello@zama.ai`.
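For context, a minimal sketch of how the utility API listed in the introduction above was typically driven from host code, assuming the backend's `device.h` header removed later in this diff is included (the buffer contents and sizes are illustrative, and error codes are ignored for brevity):
```
#include <cstdint>
#include <vector>
#include "device.h" // GPU memory/stream helpers from this backend

int main() {
  if (cuda_get_number_of_gpus() < 1)
    return 1; // no usable CUDA device
  uint32_t gpu_index = 0;
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  // Illustrative host buffer of 1024 64-bit words.
  std::vector<uint64_t> data(1024, 0);
  uint64_t size = data.size() * sizeof(uint64_t);
  void *d_data = cuda_malloc(size, gpu_index);
  cuda_memcpy_async_to_gpu(d_data, data.data(), size, stream, gpu_index);
  // ... homomorphic operations on d_data would be launched here ...
  cuda_memcpy_async_to_cpu(data.data(), d_data, size, stream, gpu_index);
  cuda_synchronize_device(gpu_index);
  cuda_drop(d_data, gpu_index);
  cuda_destroy_stream(stream, gpu_index);
  return 0;
}
```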


@@ -1,22 +0,0 @@
#include <stdio.h>
int main(int argc, char **argv) {
cudaDeviceProp dP;
float min_cc = 3.0;
int rc = cudaGetDeviceProperties(&dP, 0);
if (rc != cudaSuccess) {
cudaError_t error = cudaGetLastError();
printf("CUDA error: %s", cudaGetErrorString(error));
return rc; /* Failure */
}
if ((dP.major + (dP.minor / 10.0)) < min_cc) {
printf("Min Compute Capability of %2.1f required: %d.%d found\n Not "
"Building CUDA Code",
min_cc, dP.major, dP.minor);
return 1; /* Failure */
} else {
printf("-arch=sm_%d%d", dP.major, dP.minor);
return 0; /* Success */
}
}


@@ -1,7 +0,0 @@
#!/bin/bash
find ./{include,src,test_and_benchmark} -iregex '^.*\.\(cpp\|cu\|h\|cuh\)$' -print | xargs clang-format-11 -i -style='file'
cmake-format -i CMakeLists.txt -c ../../../compilers/concrete-compiler/compiler/.cmake-format-config.py
find ./{include,src,test_and_benchmark} -type f -name "CMakeLists.txt" | xargs -I % sh -c 'cmake-format -i % -c ../../../compilers/concrete-compiler/compiler/.cmake-format-config.py'


@@ -1,48 +0,0 @@
#ifndef CUDA_BIT_EXTRACT_H
#define CUDA_BIT_EXTRACT_H
#include <cstdint>
extern "C" {
void scratch_cuda_extract_bits_32(
void *v_stream, uint32_t gpu_index, int8_t **bit_extract_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t crt_decomposition_size,
uint32_t max_shared_memory, bool allocate_gpu_memory);
void scratch_cuda_extract_bits_64(
void *v_stream, uint32_t gpu_index, int8_t **bit_extract_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t crt_decomposition_size,
uint32_t max_shared_memory, bool allocate_gpu_memory);
void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
void *list_lwe_array_out, void *lwe_array_in,
int8_t *bit_extract_buffer, void *ksk,
void *fourier_bsk, uint32_t *number_of_bits_array,
uint32_t *delta_log_array, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log_bsk,
uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory);
void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
void *list_lwe_array_out, void *lwe_array_in,
int8_t *bit_extract_buffer, void *ksk,
void *fourier_bsk, uint32_t *number_of_bits_array,
uint32_t *delta_log_array, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log_bsk,
uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory);
void cleanup_cuda_extract_bits(void *v_stream, uint32_t gpu_index,
int8_t **bit_extract_buffer);
}
#endif // CUDA_BIT_EXTRACT_H


@@ -1,74 +0,0 @@
#ifndef CUDA_BOOLEAN_GATES_H
#define CUDA_BOOLEAN_GATES_H
#include <cstdint>
extern "C" {
void cuda_boolean_not_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_boolean_and_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in_1,
void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory);
void cuda_boolean_nand_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in_1,
void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory);
void cuda_boolean_nor_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in_1,
void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory);
void cuda_boolean_or_32(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in_1, void *lwe_array_in_2,
void *bootstrapping_key, void *ksk,
uint32_t input_lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level_count, uint32_t ks_base_log,
uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory);
void cuda_boolean_xor_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in_1,
void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory);
void cuda_boolean_xnor_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in_1,
void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory);
}
#endif // CUDA_BOOLEAN_GATES_H


@@ -1,184 +0,0 @@
#ifndef CUDA_BOOTSTRAP_H
#define CUDA_BOOTSTRAP_H
#include <cstdint>
extern "C" {
void cuda_fourier_polynomial_mul(void *input1, void *input2, void *output,
void *v_stream, uint32_t gpu_index,
uint32_t polynomial_size,
uint32_t total_polynomials);
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
uint32_t gpu_index,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
uint32_t gpu_index,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size);
void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory);
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory);
void cleanup_cuda_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer);
void scratch_cuda_bootstrap_low_latency_32(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_bootstrap_low_latency_64(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory);
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory);
void cleanup_cuda_bootstrap_low_latency(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer);
void scratch_cuda_circuit_bootstrap_vertical_packing_32(
void *v_stream, uint32_t gpu_index, int8_t **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_circuit_bootstrap_vertical_packing_64(
void *v_stream, uint32_t gpu_index, int8_t **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_wop_pbs_32(void *v_stream, uint32_t gpu_index,
int8_t **wop_pbs_buffer, uint32_t *delta_log_array,
uint32_t *cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t level_count_bsk,
uint32_t *number_of_bits_to_extract_array,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index,
int8_t **wop_pbs_buffer, uint32_t *delta_log_array,
uint32_t *cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t level_count_bsk,
uint32_t *number_of_bits_to_extract_array,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_circuit_bootstrap_vertical_packing_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *fourier_bsk, void *cbs_fpksk, void *lut_vector, int8_t *cbs_vp_buffer,
uint32_t cbs_delta_log, uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t level_count_bsk, uint32_t base_log_bsk,
uint32_t level_count_pksk, uint32_t base_log_pksk, uint32_t level_count_cbs,
uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
uint32_t max_shared_memory);
void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in, void *lut_vector, void *fourier_bsk,
void *ksk, void *cbs_fpksk, int8_t *wop_pbs_buffer,
uint32_t cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log_bsk, uint32_t level_count_bsk,
uint32_t base_log_ksk, uint32_t level_count_ksk,
uint32_t base_log_pksk, uint32_t level_count_pksk,
uint32_t base_log_cbs, uint32_t level_count_cbs,
uint32_t *number_of_bits_to_extract_array,
uint32_t *delta_log_array, uint32_t crt_decomposition_size,
uint32_t max_shared_memory);
void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index,
int8_t **wop_pbs_buffer);
void cleanup_cuda_circuit_bootstrap_vertical_packing(void *v_stream,
uint32_t gpu_index,
int8_t **cbs_vp_buffer);
uint64_t get_buffer_size_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
uint64_t get_buffer_size_bootstrap_low_latency_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
}
#ifdef __CUDACC__
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template <typename T>
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count);
template <typename T>
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count);
template <typename T>
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
#endif
#endif // CUDA_BOOTSTRAP_H


@@ -1,42 +0,0 @@
#ifndef CUDA_MULTI_BIT_H
#define CUDA_MULTI_BIT_H
#include <cstdint>
extern "C" {
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
void *dest, void *src, void *v_stream, uint32_t gpu_index,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor);
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t chunk_size = 0);
void scratch_cuda_multi_bit_pbs_64(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t chunk_size = 0);
void cleanup_cuda_multi_bit_pbs(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer);
}
#ifdef __CUDACC__
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
uint32_t level_count,
uint32_t glwe_dimension,
uint32_t num_samples);
__host__ uint64_t get_max_buffer_size_multibit_bootstrap(
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_input_lwe_ciphertext_count);
#endif
#endif // CUDA_MULTI_BIT_H


@@ -1,18 +0,0 @@
#ifndef CUDA_CIPHERTEXT_H
#define CUDA_CIPHERTEXT_H
#include <cstdint>
extern "C" {
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
void *v_stream,
uint32_t gpu_index,
uint32_t number_of_cts,
uint32_t lwe_dimension);
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
void *v_stream,
uint32_t gpu_index,
uint32_t number_of_cts,
uint32_t lwe_dimension);
};
#endif


@@ -1,44 +0,0 @@
#ifndef CUDA_CIRCUIT_BOOTSTRAP_H
#define CUDA_CIRCUIT_BOOTSTRAP_H
#include <cstdint>
extern "C" {
void scratch_cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_bsk, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_bsk, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory);
void cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory);
void cleanup_cuda_circuit_bootstrap(void *v_stream, uint32_t gpu_index,
int8_t **cbs_buffer);
}
#endif // CUDA_CIRCUIT_BOOTSTRAP_H


@@ -1,65 +0,0 @@
#ifndef DEVICE_H
#define DEVICE_H
#pragma once
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>
extern "C" {
cudaStream_t *cuda_create_stream(uint32_t gpu_index);
int cuda_destroy_stream(cudaStream_t *stream, uint32_t gpu_index);
void *cuda_malloc(uint64_t size, uint32_t gpu_index);
void *cuda_malloc_async(uint64_t size, cudaStream_t *stream,
uint32_t gpu_index);
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);
int cuda_check_support_cooperative_groups();
int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size,
uint32_t gpu_index);
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t *stream, uint32_t gpu_index);
int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size,
uint32_t gpu_index);
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t *stream, uint32_t gpu_index);
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cudaStream_t *stream, uint32_t gpu_index);
int cuda_get_number_of_gpus();
int cuda_synchronize_device(uint32_t gpu_index);
int cuda_drop(void *ptr, uint32_t gpu_index);
int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index);
int cuda_get_max_shared_memory(uint32_t gpu_index);
int cuda_synchronize_stream(void *v_stream);
#define check_cuda_error(ans) \
{ cuda_error((ans), __FILE__, __LINE__); }
inline void cuda_error(cudaError_t code, const char *file, int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), file,
line);
if (abort)
exit(code);
}
}
}
#endif
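A short, hedged usage sketch of the asynchronous helpers and the `check_cuda_error` macro declared above; the function name and buffer size below are made up, and raw runtime calls such as `cudaStreamSynchronize` are wrapped with the macro, which prints the error string and exits on failure:
```
#include "device.h"

// Illustrative only: allocate a scratch buffer asynchronously, zero it,
// synchronize, then release it.
void roundtrip_example(uint32_t gpu_index) {
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  uint64_t size = 4096; // arbitrary size for the sketch
  // Best-effort check that the device has enough free memory for this size.
  cuda_check_valid_malloc(size, gpu_index);
  void *buffer = cuda_malloc_async(size, stream, gpu_index);
  cuda_memset_async(buffer, 0, size, stream, gpu_index);
  // Raw CUDA runtime calls can be wrapped with check_cuda_error, which
  // prints the error string and exits on failure.
  check_cuda_error(cudaStreamSynchronize(*stream));
  cuda_drop_async(buffer, stream, gpu_index);
  cuda_destroy_stream(stream, gpu_index);
}
```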


@@ -1,100 +0,0 @@
#include "cuComplex.h"
#include "thrust/complex.h"
#include <iostream>
#include <string>
#include <type_traits>
#define PRINT_VARS
#ifdef PRINT_VARS
#define PRINT_DEBUG_5(var, begin, end, step, cond) \
_print_debug(var, #var, begin, end, step, cond, "", false)
#define PRINT_DEBUG_6(var, begin, end, step, cond, text) \
_print_debug(var, #var, begin, end, step, cond, text, true)
#define CAT(A, B) A##B
#define PRINT_SELECT(NAME, NUM) CAT(NAME##_, NUM)
#define GET_COUNT(_1, _2, _3, _4, _5, _6, COUNT, ...) COUNT
#define VA_SIZE(...) GET_COUNT(__VA_ARGS__, 6, 5, 4, 3, 2, 1)
#define PRINT_DEBUG(...) \
PRINT_SELECT(PRINT_DEBUG, VA_SIZE(__VA_ARGS__))(__VA_ARGS__)
#else
#define PRINT_DEBUG(...)
#endif
template <typename T>
__device__ typename std::enable_if<std::is_unsigned<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %u\n", var_name, i, var[i]);
}
}
__syncthreads();
}
template <typename T>
__device__ typename std::enable_if<std::is_signed<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %d\n", var_name, i, var[i]);
}
}
__syncthreads();
}
template <typename T>
__device__ typename std::enable_if<std::is_floating_point<T>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %.15f\n", var_name, i, var[i]);
}
}
__syncthreads();
}
template <typename T>
__device__
typename std::enable_if<std::is_same<T, thrust::complex<double>>::value,
void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].real(),
var[i].imag());
}
}
__syncthreads();
}
template <typename T>
__device__
typename std::enable_if<std::is_same<T, cuDoubleComplex>::value, void>::type
_print_debug(T *var, const char *var_name, int start, int end, int step,
bool cond, const char *text, bool has_text) {
__syncthreads();
if (cond) {
if (has_text)
printf("%s\n", text);
for (int i = start; i < end; i += step) {
printf("%s[%u]: %.15f , %.15f\n", var_name, i, var[i].x, var[i].y);
}
}
__syncthreads();
}
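A brief, hypothetical usage sketch for the `PRINT_DEBUG` dispatch above (kernel and buffer names are invented): the 5-argument form prints `var[begin..end)` with the given step whenever the condition holds, and the 6-argument form prints a label first.
```
// Hypothetical kernel showing both PRINT_DEBUG forms (names are invented).
__global__ void debug_demo(unsigned int *mask, double *fft_re) {
  // 5-argument form: variable, begin, end, step, condition.
  PRINT_DEBUG(mask, 0, 8, 1, threadIdx.x == 0 && blockIdx.x == 0);
  // 6-argument form: same arguments plus a label printed before the values.
  PRINT_DEBUG(fft_re, 0, 4, 1, threadIdx.x == 0, "fft real part:");
}
```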


@@ -1,33 +0,0 @@
#ifndef CNCRT_KS_H_
#define CNCRT_KS_H_
#include <cstdint>
extern "C" {
void cuda_keyswitch_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
void cuda_keyswitch_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples);
void cuda_fp_keyswitch_lwe_to_glwe_32(
void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
uint32_t number_of_keys);
void cuda_fp_keyswitch_lwe_to_glwe_64(
void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
uint32_t number_of_keys);
}
#endif // CNCRT_KS_H_


@@ -1,89 +0,0 @@
#ifndef CUDA_LINALG_H_
#define CUDA_LINALG_H_
#include <cstdint>
extern "C" {
// Three types of pbs are available for integer multiplication
enum PBS_TYPE { MULTI_BIT = 0, LOW_LAT = 1, AMORTIZED = 2 };
void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
void *v_stream, uint32_t gpu_index, void *mem_ptr, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
uint32_t ks_base_log, uint32_t ks_level, uint32_t num_blocks,
PBS_TYPE pbs_type, uint32_t max_shared_memory, bool allocate_gpu_memory);
void cuda_integer_mult_radix_ciphertext_kb_64(
void *v_stream, uint32_t gpu_index, void *radix_lwe_out,
void *radix_lwe_left, void *radix_lwe_right, uint32_t *ct_degree_out,
uint32_t *ct_degree_left, uint32_t *ct_degree_right, void *bsk, void *ksk,
void *mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
uint32_t ks_level, uint32_t num_blocks, PBS_TYPE pbs_type,
uint32_t max_shared_memory);
void scratch_cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
void *mem_ptr, void *bsk, void *ksk, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
uint32_t ks_base_log, uint32_t ks_level, uint32_t num_blocks,
PBS_TYPE pbs_type, uint32_t max_shared_memory, bool allocate_gpu_memory);
void cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right,
uint32_t *ct_degree_out, uint32_t *ct_degree_left,
uint32_t *ct_degree_right, void *bsk, void *ksk, void *mem_ptr,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t num_blocks, PBS_TYPE pbs_type, uint32_t max_shared_memory);
}
#endif // CUDA_LINALG_H_


@@ -1,63 +0,0 @@
#ifndef VERTICAL_PACKING_H
#define VERTICAL_PACKING_H
#include <cstdint>
extern "C" {
void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t lut_vector_size, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t lut_vector_size, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector,
int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t lut_vector_size,
uint32_t tau, uint32_t max_shared_memory);
void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector,
int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t lut_vector_size,
uint32_t tau, uint32_t max_shared_memory);
void cleanup_cuda_cmux_tree(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer);
void scratch_cuda_blind_rotation_sample_extraction_32(
void *v_stream, uint32_t gpu_index, int8_t **br_se_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void scratch_cuda_blind_rotation_sample_extraction_64(
void *v_stream, uint32_t gpu_index, int8_t **br_se_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory);
void cuda_blind_rotate_and_sample_extraction_64(
void *v_stream, uint32_t gpu_index, void *lwe_out, void *ggsw_in,
void *lut_vector, int8_t *br_se_buffer, uint32_t mbr_size, uint32_t tau,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t max_shared_memory);
void cleanup_cuda_blind_rotation_sample_extraction(void *v_stream,
uint32_t gpu_index,
int8_t **br_se_buffer);
}
#endif // VERTICAL_PACKING_H


@@ -1,13 +0,0 @@
set(SOURCES
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/boolean_gates.h ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h)
file(GLOB SOURCES "*.cu" "*.h" "fft/*.cu")
add_library(concrete_cuda STATIC ${SOURCES})
set_target_properties(
concrete_cuda
PROPERTIES CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
CUDA_ARCHITECTURES native)
target_link_libraries(concrete_cuda PUBLIC cudart)
target_include_directories(concrete_cuda PRIVATE .)


@@ -1,111 +0,0 @@
#include "addition.cuh"
/*
* Perform the addition of two u32 input LWE ciphertext vectors.
* See the equivalent operation on u64 ciphertexts for more details.
*/
void cuda_add_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(v_stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in_1),
static_cast<uint32_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the addition of two u64 input LWE ciphertext vectors.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in_1` is the first LWE ciphertext vector used as input, it
* should have been allocated and initialized before calling this function. It
* has the same size as the output array.
* - `lwe_array_in_2` is the second LWE ciphertext vector used as input, it
* should have been allocated and initialized before calling this function. It
* has the same size as the output array.
* - `input_lwe_dimension` is the number of mask elements in the two input and
* in the output ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each
* input LWE ciphertext vector, as well as in the output.
*
* Each element (mask element or body) of the input LWE ciphertext vector 1 is
* added to the corresponding element in the input LWE ciphertext 2. The result
* is stored in the output LWE ciphertext vector. The two input LWE ciphertext
* vectors are left unchanged. This function is a wrapper to a device function
* that performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in_1,
void *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition(v_stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in_1),
static_cast<uint64_t *>(lwe_array_in_2), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the addition of a u32 input LWE ciphertext vector with a u32
* plaintext vector. See the equivalent operation on u64 data for more details.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext(v_stream, gpu_index,
static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
}
/*
* Perform the addition of a u64 input LWE ciphertext vector with a u64 input
* plaintext vector.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
* been allocated and initialized before calling this function. It has the same
* size as the output array.
* - `plaintext_array_in` is the plaintext vector used as input, it should have
* been allocated and initialized before calling this function. It should be of
* size `input_lwe_ciphertext_count`.
* - `input_lwe_dimension` is the number of mask elements in the input and
* output LWE ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
* input LWE ciphertext vector, as well as in the output. It is also the number
* of plaintexts in the input plaintext vector.
*
* Each plaintext of the input plaintext vector is added to the body of the
* corresponding LWE ciphertext in the LWE ciphertext vector. The result of the
* operation is stored in the output LWE ciphertext vector. The two input
* vectors are unchanged. This function is a wrapper to a device function that
* performs the operation on the GPU.
*/
void cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *plaintext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_addition_plaintext(v_stream, gpu_index,
static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(plaintext_array_in),
input_lwe_dimension, input_lwe_ciphertext_count);
}
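To make the calling convention documented above concrete, here is a hedged sketch of a single 64-bit ciphertext-vector addition; the wrapper name and the assumption that all buffers already live on the GPU are illustrative, and the declarations come from `device.h` and `linear_algebra.h` elsewhere in this diff:
```
#include "device.h"
#include "linear_algebra.h"

// Illustrative wrapper: add two vectors of `count` LWE ciphertexts of
// dimension `lwe_dim`; all three buffers are assumed to live on the GPU.
void add_example(void *d_out, void *d_in_1, void *d_in_2, uint32_t lwe_dim,
                 uint32_t count, uint32_t gpu_index) {
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  // Each ciphertext stores lwe_dim mask elements plus one body element.
  cuda_add_lwe_ciphertext_vector_64(stream, gpu_index, d_out, d_in_1, d_in_2,
                                    lwe_dim, count);
  cuda_synchronize_stream(stream);
  cuda_destroy_stream(stream, gpu_index);
}
```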


@@ -1,87 +0,0 @@
#ifndef CUDA_ADD_H
#define CUDA_ADD_H
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "device.h"
#include "linear_algebra.h"
#include "utils/kernel_dimensions.cuh"
#include <stdio.h>
template <typename T>
__global__ void addition(T *output, T *input_1, T *input_2,
uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
if (index < num_entries) {
// Here we take advantage of the wrapping behaviour of uint
output[index] = input_1[index] + input_2[index];
}
}
template <typename T>
__global__ void plaintext_addition(T *output, T *lwe_input, T *plaintext_input,
uint32_t input_lwe_dimension,
uint32_t num_entries) {
int tid = threadIdx.x;
int plaintext_index = blockIdx.x * blockDim.x + tid;
if (plaintext_index < num_entries) {
int index =
plaintext_index * (input_lwe_dimension + 1) + input_lwe_dimension;
// Here we take advantage of the wrapping behaviour of uint
output[index] = lwe_input[index] + plaintext_input[plaintext_index];
}
}
template <typename T>
__host__ void host_addition(void *v_stream, uint32_t gpu_index, T *output,
T *input_1, T *input_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count * lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
auto stream = static_cast<cudaStream_t *>(v_stream);
addition<<<grid, thds, 0, *stream>>>(output, input_1, input_2, num_entries);
check_cuda_error(cudaGetLastError());
}
template <typename T>
__host__ void host_addition_plaintext(void *v_stream, uint32_t gpu_index,
T *output, T *lwe_input,
T *plaintext_input,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
auto stream = static_cast<cudaStream_t *>(v_stream);
check_cuda_error(cudaMemcpyAsync(output, lwe_input,
(input_lwe_dimension + 1) *
input_lwe_ciphertext_count * sizeof(T),
cudaMemcpyDeviceToDevice, *stream));
plaintext_addition<<<grid, thds, 0, *stream>>>(
output, lwe_input, plaintext_input, input_lwe_dimension, num_entries);
check_cuda_error(cudaGetLastError());
}
#endif // CUDA_ADD_H


@@ -1,356 +0,0 @@
#include "bit_extraction.cuh"
/*
* Runs standard checks to validate the inputs
*/
void checks_fast_extract_bits(int glwe_dimension, int polynomial_size,
int level_count_bsk, int crt_decomposition_size) {
assert(("Error (GPU extract bits): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l), the factor 4 being related
// to the 50% occupancy.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the number of streaming multiprocessors on the device "
"divided by 4 * (k + 1) "
"level_count_bsk",
crt_decomposition_size <=
number_of_sm / 4. / (glwe_dimension + 1) / level_count_bsk));
}
/*
* Runs standard checks to validate the inputs
*/
void checks_extract_bits(int nbits, int glwe_dimension, int polynomial_size,
int base_log_bsk, int level_count_bsk,
int crt_decomposition_size) {
assert(("Error (GPU extract bits): base log should be <= nbits",
base_log_bsk <= nbits));
checks_fast_extract_bits(glwe_dimension, polynomial_size, level_count_bsk,
crt_decomposition_size);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the bit extraction on 32-bit inputs, into `bit_extract_buffer`. It also
* configures SM options on the GPU in case FULLSM mode is going to be used.
*/
void scratch_cuda_extract_bits_32(
void *v_stream, uint32_t gpu_index, int8_t **bit_extract_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t crt_decomposition_size,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_extract_bits(glwe_dimension, polynomial_size, level_count,
crt_decomposition_size);
switch (polynomial_size) {
case 256:
scratch_extract_bits<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_extract_bits<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 1024:
scratch_extract_bits<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 2048:
scratch_extract_bits<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 4096:
scratch_extract_bits<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 8192:
scratch_extract_bits<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
default:
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the bit extraction on 64-bit inputs, into `bit_extract_buffer`. It also
* configures SM options on the GPU in case FULLSM mode is going to be used.
*/
void scratch_cuda_extract_bits_64(
void *v_stream, uint32_t gpu_index, int8_t **bit_extract_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t crt_decomposition_size,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_extract_bits(glwe_dimension, polynomial_size, level_count,
crt_decomposition_size);
switch (polynomial_size) {
case 256:
scratch_extract_bits<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_extract_bits<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 1024:
scratch_extract_bits<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 2048:
scratch_extract_bits<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 4096:
scratch_extract_bits<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
case 8192:
scratch_extract_bits<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count, crt_decomposition_size, max_shared_memory,
allocate_gpu_memory);
break;
default:
break;
}
}
/* Perform bit extraction on a batch of 32-bit LWE ciphertexts.
* See the corresponding function on 64-bit LWE ciphertexts for more details.
*/
void cuda_extract_bits_32(void *v_stream, uint32_t gpu_index,
void *list_lwe_array_out, void *lwe_array_in,
int8_t *bit_extract_buffer, void *ksk,
void *fourier_bsk, uint32_t *number_of_bits_array,
uint32_t *delta_log_array, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log_bsk,
uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory) {
checks_extract_bits(32, glwe_dimension, polynomial_size, base_log_bsk,
level_count_bsk, crt_decomposition_size);
switch (polynomial_size) {
case 256:
host_extract_bits<uint32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 512:
host_extract_bits<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 1024:
host_extract_bits<uint32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 2048:
host_extract_bits<uint32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 4096:
host_extract_bits<uint32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 8192:
host_extract_bits<uint32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)list_lwe_array_out,
(uint32_t *)lwe_array_in, bit_extract_buffer, (uint32_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
default:
break;
}
}
/* Perform bit extraction on a batch of 64-bit LWE ciphertexts.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - 'number_of_bits' bits are extracted from each ciphertext, starting at bit
* number 'delta_log' (0-indexed, inclusive).
* Output bits are ordered from MSB to LSB. Every extracted bit is represented
* as an LWE ciphertext, containing the encryption of the bit scaled by q/2.
* - 'list_lwe_array_out' output batch of LWE ciphertexts, one per extracted bit
* of every input ciphertext
* - 'lwe_array_in' batch of input LWE ciphertexts, of size
* ('lwe_dimension_in' + 1) * crt_decomposition_size * sizeof(u64)
* The following 5 parameters are not actual inputs of the function but working
* buffers used during the computation; they are allocated once and can be
* reused across different calls of the extract_bit function.
* - 'lwe_array_in_buffer' same size as 'lwe_array_in'
* - 'lwe_array_in_shifted_buffer' same size as 'lwe_array_in'
* - 'lwe_array_out_ks_buffer' with size:
* ('lwe_dimension_out' + 1) * crt_decomposition_size * sizeof(u64)
* - 'lwe_array_out_pbs_buffer' same size as 'lwe_array_in'
* - 'lut_pbs' with size:
* (glwe_dimension + 1) * (lwe_dimension_in + 1) * sizeof(u64)
* The other inputs are:
* - 'lut_vector_indexes' stores the index corresponding to which test
* vector to use
* - 'ksk' keyswitch key
* - 'fourier_bsk' complex compressed bsk in fourier domain
* - 'lwe_dimension_in' input LWE ciphertext dimension, supported input
* dimensions are: {256, 512, 1024, 2048, 4096, 8192}
* - 'lwe_dimension_out' output LWE ciphertext dimension
* - 'glwe_dimension' GLWE dimension, only glwe_dimension = 1 is supported
* for now
* - 'base_log_bsk' base_log for bootstrapping
* - 'level_count_bsk' decomposition level count for bootstrapping
* - 'base_log_ksk' base_log for keyswitch
* - 'level_count_ksk' decomposition level for keyswitch
* - 'crt_decomposition_size' number of input LWE ciphertexts
* - 'max_shared_memory' maximum amount of shared memory to be used inside
* device functions
*
* This function will call corresponding template of wrapper host function which
* will manage the calls of device functions.
*/
void cuda_extract_bits_64(void *v_stream, uint32_t gpu_index,
void *list_lwe_array_out, void *lwe_array_in,
int8_t *bit_extract_buffer, void *ksk,
void *fourier_bsk, uint32_t *number_of_bits_array,
uint32_t *delta_log_array, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log_bsk,
uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory) {
checks_extract_bits(64, glwe_dimension, polynomial_size, base_log_bsk,
level_count_bsk, crt_decomposition_size);
switch (polynomial_size) {
case 256:
host_extract_bits<uint64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 512:
host_extract_bits<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 1024:
host_extract_bits<uint64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 2048:
host_extract_bits<uint64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 4096:
host_extract_bits<uint64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
case 8192:
host_extract_bits<uint64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)list_lwe_array_out,
(uint64_t *)lwe_array_in, bit_extract_buffer, (uint64_t *)ksk,
(double2 *)fourier_bsk, number_of_bits_array, delta_log_array,
lwe_dimension_in, lwe_dimension_out, glwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_ksk, level_count_ksk,
crt_decomposition_size, max_shared_memory);
break;
default:
break;
}
}
/*
* This cleanup function frees the GPU memory used for bit extraction, held in
* 'bit_extract_buffer', for 32-bit or 64-bit inputs.
*/
void cleanup_cuda_extract_bits(void *v_stream, uint32_t gpu_index,
int8_t **bit_extract_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*bit_extract_buffer, stream, gpu_index);
}
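/*
 * Illustrative host-side call sequence (a documentation sketch, not part of
 * the original sources). Device pointers d_out, d_in, d_ksk and d_fourier_bsk
 * are assumed to be already allocated and populated, bit_extract_buffer is
 * assumed to have been allocated by the corresponding scratch function
 * (scratch_extract_bits instantiated on uint64_t), and the parameter values
 * below are placeholders.
 */
static void example_extract_bits_usage(uint32_t gpu_index, void *d_out,
                                       void *d_in, int8_t *bit_extract_buffer,
                                       void *d_ksk, void *d_fourier_bsk,
                                       uint32_t max_shared_memory) {
  // Hypothetical parameter set, for illustration only.
  uint32_t glwe_dimension = 1, polynomial_size = 1024;
  uint32_t lwe_dimension_in = glwe_dimension * polynomial_size;
  uint32_t lwe_dimension_out = 600;
  uint32_t base_log_bsk = 10, level_count_bsk = 2;
  uint32_t base_log_ksk = 4, level_count_ksk = 3;
  uint32_t crt_decomposition_size = 2;
  uint32_t number_of_bits_array[] = {4, 4}; // bits to extract per input
  uint32_t delta_log_array[] = {60, 60};    // position of the message MSB

  cudaStream_t *stream = cuda_create_stream(gpu_index);
  // Extract 4 bits from each of the 2 input ciphertexts; the results land in
  // d_out as number_of_bits LWE ciphertexts per input, ordered MSB first.
  cuda_extract_bits_64(stream, gpu_index, d_out, d_in, bit_extract_buffer,
                       d_ksk, d_fourier_bsk, number_of_bits_array,
                       delta_log_array, lwe_dimension_in, lwe_dimension_out,
                       glwe_dimension, polynomial_size, base_log_bsk,
                       level_count_bsk, base_log_ksk, level_count_ksk,
                       crt_decomposition_size, max_shared_memory);
  // Release the temporary buffer once it is no longer needed.
  cleanup_cuda_extract_bits(stream, gpu_index, &bit_extract_buffer);
  cudaStreamSynchronize(*stream);
  cuda_destroy_stream(stream, gpu_index);
}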


@@ -1,347 +0,0 @@
#ifndef BIT_EXTRACT_CUH
#define BIT_EXTRACT_CUH
#include "bit_extraction.h"
#include "bootstrap_fast_low_latency.cuh"
#include "device.h"
#include "keyswitch.cuh"
#include "polynomial/parameters.cuh"
#include "utils/timer.cuh"
/*
* Copies a batch of LWE ciphertexts into a destination buffer, multiplying
* every coefficient by 'value'; works for ciphertexts with sizes supported by
* params::degree
*
* Each x-block handles a params::degree-chunk of src
*/
template <typename Torus, class params>
__global__ void copy_and_shift_lwe(Torus *dst_shift, Torus *src, Torus value,
uint32_t glwe_dimension) {
int tid = threadIdx.x;
auto cur_dst_shift = &dst_shift[blockIdx.x * params::degree];
auto cur_src = &src[blockIdx.x * params::degree];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
cur_dst_shift[tid] = cur_src[tid] * value;
tid += params::degree / params::opt;
}
if (threadIdx.x == 0 && blockIdx.x == 0) {
cur_dst_shift[glwe_dimension * params::degree] =
cur_src[glwe_dimension * params::degree] * value;
}
}
/*
* Copies a batch of LWE ciphertexts to another buffer when the size is not supported by
* params::degree
*/
template <typename Torus>
__global__ void copy_small_lwe(Torus *dst, Torus *src, uint32_t small_lwe_size,
uint32_t number_of_bits, uint32_t lwe_id) {
size_t blockId = blockIdx.x;
size_t threads_per_block = blockDim.x;
size_t opt = small_lwe_size / threads_per_block;
size_t rem = small_lwe_size & (threads_per_block - 1);
auto cur_lwe_list = &dst[blockId * small_lwe_size * number_of_bits];
auto cur_dst = &cur_lwe_list[lwe_id * small_lwe_size];
auto cur_src = &src[blockId * small_lwe_size];
size_t tid = threadIdx.x;
for (int i = 0; i < opt; i++) {
cur_dst[tid] = cur_src[tid];
tid += threads_per_block;
}
if (threadIdx.x < rem)
cur_dst[tid] = cur_src[tid];
}
/*
* Wrapping-adds 'value' to the body of each ciphertext;
* should be called with blockDim.x = 1;
* blockIdx.x refers to the id of the ciphertext.
* NOTE: check whether putting this functionality in copy_small_lwe or
* fill_lut_body_for_current_bit is faster
*/
template <typename Torus>
__global__ void add_to_body(Torus *lwe, size_t lwe_dimension, Torus value) {
lwe[blockIdx.x * (lwe_dimension + 1) + lwe_dimension] += value;
}
/*
* Add alpha, where alpha = delta*2^{bit_idx-1}, to end up with an encryption
* of 0 if the extracted bit was 0, and of 1 otherwise.
* Remove the extracted bit from the state LWE to get a 0 at the extracted bit
* location.
* Shift onto the padding bit for the next iteration, which is why the
* multiplier 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) is used
* instead of 1ll << (ciphertext_n_bits - delta_log - bit_idx - 1)
*/
template <typename Torus, class params>
__global__ void add_sub_and_mul_lwe(Torus *shifted_lwe, Torus *state_lwe,
Torus *pbs_lwe_array_out, Torus add_value,
Torus mul_value, uint32_t glwe_dimension) {
size_t tid = threadIdx.x;
size_t blockId = blockIdx.x;
auto cur_shifted_lwe =
&shifted_lwe[blockId * (glwe_dimension * params::degree + 1)];
auto cur_state_lwe =
&state_lwe[blockId * (glwe_dimension * params::degree + 1)];
auto cur_pbs_lwe_array_out =
&pbs_lwe_array_out[blockId * (glwe_dimension * params::degree + 1)];
#pragma unroll
for (int i = 0; i < glwe_dimension * params::opt; i++) {
cur_shifted_lwe[tid] = cur_state_lwe[tid] -= cur_pbs_lwe_array_out[tid];
cur_shifted_lwe[tid] *= mul_value;
tid += params::degree / params::opt;
}
if (threadIdx.x == 0) {
cur_shifted_lwe[glwe_dimension * params::degree] =
cur_state_lwe[glwe_dimension * params::degree] -=
(cur_pbs_lwe_array_out[glwe_dimension * params::degree] + add_value);
cur_shifted_lwe[glwe_dimension * params::degree] *= mul_value;
}
}
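/*
 * Worked example (illustrative values added for documentation): for a 64-bit
 * Torus with delta_log = 60 and bit_idx = 0, the host passes
 *   add_value = 1ll << (delta_log - 1 + bit_idx)                     = 1 << 59
 *   mul_value = 1ll << (ciphertext_n_bits - delta_log - bit_idx - 2) = 1 << 2
 * so the bit that was just extracted is removed from the state LWE and the
 * next bit is moved up to the padding-bit position for the next iteration.
 */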
/*
* Fill the lut (body only) for the current bit; this is equivalent to a
* trivial encryption, as the mask is all 0s.
* blockIdx.x refers to the id of the lut vector
*/
template <typename Torus, class params>
__global__ void fill_lut_body_for_current_bit(Torus *lut, Torus value,
uint32_t glwe_dimension) {
Torus *cur_poly = &lut[(blockIdx.x * (glwe_dimension + 1) + glwe_dimension) *
params::degree];
size_t tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
cur_poly[tid] = value;
tid += params::degree / params::opt;
}
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_extract_bits(
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t crt_decomposition_size) {
uint64_t buffer_size =
sizeof(Torus) // lut_vector_indexes
+ ((glwe_dimension + 1) * polynomial_size) * sizeof(Torus) // lut_pbs
+ (glwe_dimension * polynomial_size + 1) *
sizeof(Torus) // lwe_array_in_buffer
+ (glwe_dimension * polynomial_size + 1) *
sizeof(Torus) // lwe_array_in_shifted_buffer
+ (lwe_dimension + 1) * sizeof(Torus) // lwe_array_out_ks_buffer
+ (glwe_dimension * polynomial_size + 1) *
sizeof(Torus); // lwe_array_out_pbs_buffer
buffer_size =
(buffer_size + buffer_size % sizeof(double2)) * crt_decomposition_size;
return buffer_size;
}
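/*
 * Worked example (illustrative parameters added for documentation): with
 * Torus = uint64_t (8 bytes), glwe_dimension = 1, polynomial_size = 1024,
 * lwe_dimension = 600 and crt_decomposition_size = 1:
 *   lut_vector_indexes                                       8
 *   lut_pbs                      (1 + 1) * 1024 * 8    = 16384
 *   lwe_array_in_buffer          (1 * 1024 + 1) * 8    =  8200
 *   lwe_array_in_shifted_buffer  (1 * 1024 + 1) * 8    =  8200
 *   lwe_array_out_ks_buffer      (600 + 1) * 8         =  4808
 *   lwe_array_out_pbs_buffer     (1 * 1024 + 1) * 8    =  8200
 * for a total of 45800 bytes, padded by 45800 % sizeof(double2) = 8 to
 * 45808 bytes per input ciphertext.
 */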
template <typename Torus, typename STorus, typename params>
__host__ void
scratch_extract_bits(void *v_stream, uint32_t gpu_index,
int8_t **bit_extract_buffer, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t crt_decomposition_size,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t buffer_size = get_buffer_size_extract_bits<Torus>(
glwe_dimension, lwe_dimension, polynomial_size,
crt_decomposition_size) +
get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
crt_decomposition_size, max_shared_memory);
// allocate and initialize device pointers for bit extraction
if (allocate_gpu_memory) {
*bit_extract_buffer =
(int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
}
  // lut_vector_indexes is the last buffer in the bit_extract_buffer.
  // It is hard-set to 0: only one LUT is given as input and it is the same
  // for all LWE inputs. For simplicity we initialize the whole buffer to 0.
check_cuda_error(
cudaMemsetAsync(*bit_extract_buffer, 0, buffer_size, *stream));
scratch_bootstrap_fast_low_latency<Torus, STorus, params>(
v_stream, gpu_index, bit_extract_buffer, glwe_dimension, polynomial_size,
level_count, crt_decomposition_size, max_shared_memory, false);
}
/*
* Host function for CUDA bit extraction on a single ciphertext.
* It executes the device functions in a specific order and manages
* parallelism.
*/
template <typename Torus, class params>
__host__ void host_single_ciphertext_extract_bits(
void *v_stream, uint32_t gpu_index, Torus *list_lwe_array_out,
Torus *lwe_array_in, int8_t *bit_extract_buffer, Torus *ksk,
double2 *fourier_bsk, uint32_t number_of_bits, uint32_t delta_log,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log_bsk,
uint32_t level_count_bsk, uint32_t base_log_ksk, uint32_t level_count_ksk,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t ciphertext_n_bits = sizeof(Torus) * 8;
int threads = params::degree / params::opt;
// Always define the PBS buffer first, because it has the strongest memory
// alignment requirement (16 bytes for double2)
int8_t *pbs_buffer = (int8_t *)bit_extract_buffer;
Torus *lut_pbs =
(Torus *)pbs_buffer +
(ptrdiff_t)(get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count_bsk, 1,
max_shared_memory) /
sizeof(Torus));
Torus *lwe_array_in_buffer =
(Torus *)lut_pbs + (ptrdiff_t)((glwe_dimension + 1) * polynomial_size);
Torus *lwe_array_in_shifted_buffer =
(Torus *)lwe_array_in_buffer +
(ptrdiff_t)(glwe_dimension * polynomial_size + 1);
Torus *lwe_array_out_ks_buffer =
(Torus *)lwe_array_in_shifted_buffer +
(ptrdiff_t)(glwe_dimension * polynomial_size + 1);
Torus *lwe_array_out_pbs_buffer =
(Torus *)lwe_array_out_ks_buffer + (ptrdiff_t)(lwe_dimension_out + 1);
// lut_vector_indexes is the last array in the bit_extract buffer
Torus *lut_vector_indexes =
(Torus *)lwe_array_out_pbs_buffer +
(ptrdiff_t)((glwe_dimension * polynomial_size + 1));
// shift lwe on padding bit and copy in new buffer
check_cuda_error(
cudaMemcpyAsync(lwe_array_in_buffer, lwe_array_in,
(glwe_dimension * polynomial_size + 1) * sizeof(Torus),
cudaMemcpyDeviceToDevice, *stream));
copy_and_shift_lwe<Torus, params><<<glwe_dimension, threads, 0, *stream>>>(
lwe_array_in_shifted_buffer, lwe_array_in,
(Torus)(1ll << (ciphertext_n_bits - delta_log - 1)), glwe_dimension);
check_cuda_error(cudaGetLastError());
for (int bit_idx = 0; bit_idx < number_of_bits; bit_idx++) {
cuda_keyswitch_lwe_ciphertext_vector(
v_stream, gpu_index, lwe_array_out_ks_buffer,
lwe_array_in_shifted_buffer, ksk, lwe_dimension_in, lwe_dimension_out,
base_log_ksk, level_count_ksk, 1);
copy_small_lwe<<<1, 256, 0, *stream>>>(
list_lwe_array_out, lwe_array_out_ks_buffer, lwe_dimension_out + 1,
number_of_bits, number_of_bits - bit_idx - 1);
check_cuda_error(cudaGetLastError());
if (bit_idx == number_of_bits - 1) {
break;
}
// Add q/4 to center the error while computing a negacyclic LUT
add_to_body<Torus>
<<<1, 1, 0, *stream>>>(lwe_array_out_ks_buffer, lwe_dimension_out,
(Torus)(1ll << (ciphertext_n_bits - 2)));
check_cuda_error(cudaGetLastError());
    // Fill the lut for the current bit (equivalent to a trivial encryption as
    // the mask is 0s). The LUT is filled with -alpha in each coefficient,
    // where alpha = delta*2^{bit_idx-1}
fill_lut_body_for_current_bit<Torus, params><<<1, threads, 0, *stream>>>(
        lut_pbs, (Torus)(0ll - (1ll << (delta_log - 1 + bit_idx))),
glwe_dimension);
check_cuda_error(cudaGetLastError());
host_bootstrap_fast_low_latency<Torus, params>(
v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_pbs,
lut_vector_indexes, lwe_array_out_ks_buffer, fourier_bsk, pbs_buffer,
glwe_dimension, lwe_dimension_out, polynomial_size, base_log_bsk,
level_count_bsk, 1, 1, max_shared_memory);
// Add alpha where alpha = delta*2^{bit_idx-1} to end up with an encryption
// of 0 if the extracted bit was 0 and 1 in the other case
add_sub_and_mul_lwe<Torus, params><<<1, threads, 0, *stream>>>(
lwe_array_in_shifted_buffer, lwe_array_in_buffer,
lwe_array_out_pbs_buffer, (Torus)(1ll << (delta_log - 1 + bit_idx)),
(Torus)(1ll << (ciphertext_n_bits - delta_log - bit_idx - 2)),
glwe_dimension);
check_cuda_error(cudaGetLastError());
}
}
/*
* Host function for CUDA bit extraction on a batch of ciphertexts.
* It executes the device functions in a specific order and manages
* parallelism.
*/
template <typename Torus, class params>
__host__ void
host_extract_bits(void *v_stream, uint32_t gpu_index, Torus *list_lwe_array_out,
Torus *lwe_array_in, int8_t *bit_extract_buffer, Torus *ksk,
double2 *fourier_bsk, uint32_t *number_of_bits_array,
uint32_t *delta_log_array, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log_bsk,
uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk, uint32_t crt_decomposition_size,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
cudaStream_t *sub_streams[crt_decomposition_size];
for (int i = 0; i < crt_decomposition_size; i++) {
sub_streams[i] = cuda_create_stream(gpu_index);
}
  uint64_t bit_extract_buffer_size =
get_buffer_size_extract_bits<Torus>(glwe_dimension, lwe_dimension_out,
polynomial_size, 1) +
get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count_bsk, 1,
max_shared_memory);
int cur_total_lwe = 0;
for (int i = 0; i < crt_decomposition_size; i++) {
uint32_t number_of_bits = number_of_bits_array[i];
auto cur_input_lwe = &lwe_array_in[i * (lwe_dimension_in + 1)];
auto cur_output_lwe_array =
&list_lwe_array_out[cur_total_lwe * (lwe_dimension_out + 1)];
auto cur_bit_extract_buffer =
&bit_extract_buffer[i * bit_extract_buffer_size];
host_single_ciphertext_extract_bits<Torus, params>(
(void *)sub_streams[i], gpu_index, cur_output_lwe_array, cur_input_lwe,
cur_bit_extract_buffer, ksk, fourier_bsk, number_of_bits,
delta_log_array[i], lwe_dimension_in, lwe_dimension_out, glwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, max_shared_memory);
cur_total_lwe += number_of_bits_array[i];
}
cudaEvent_t event;
cudaEventCreate(&event);
for (int i = 0; i < crt_decomposition_size; i++) {
cudaEventRecord(event, *(sub_streams[i]));
cudaStreamWaitEvent(*stream, event, 0);
}
for (int i = 0; i < crt_decomposition_size; i++) {
cuda_destroy_stream((sub_streams[i]), gpu_index);
}
cudaEventDestroy(event);
}
#endif // BIT_EXTRACT_CUH


@@ -1,686 +0,0 @@
#ifndef CUDA_BOOLEAN_GATES_CU
#define CUDA_BOOLEAN_GATES_CU
#include "bootstrap.h"
#include "device.h"
#include "keyswitch.h"
#include "linear_algebra.h"
constexpr uint32_t PLAINTEXT_TRUE{1 << (32 - 3)};
constexpr uint32_t PLAINTEXT_FALSE{static_cast<uint32_t>(7 << (32 - 3))};
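/*
 * Encoding note (worked reasoning added for documentation; it follows from
 * the two constants above and the standard TFHE boolean encoding). Booleans
 * live on the top bits of the 32-bit torus: TRUE = q/8 (1 << 29) and
 * FALSE = 7q/8 = -q/8. For the AND gate below, the sum
 * ct_left + ct_right + FALSE lands in the positive half of the torus (q/8)
 * only when both inputs are TRUE, and in the negative half otherwise
 * (-q/8 or -3q/8). The PBS, whose test polynomial has PLAINTEXT_TRUE in every
 * body coefficient and a zero mask, maps the positive half to PLAINTEXT_TRUE
 * and, by negacyclicity, the negative half to -PLAINTEXT_TRUE =
 * PLAINTEXT_FALSE.
 */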
extern "C" void cuda_boolean_not_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out, void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cuda_negate_lwe_ciphertext_vector_32(v_stream, gpu_index, lwe_array_out,
lwe_array_in, input_lwe_dimension,
input_lwe_ciphertext_count);
}
extern "C" void cuda_boolean_and_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in_1, void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t *lwe_buffer_1 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
// 1. Add the two ciphertexts
cuda_add_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_buffer_1, lwe_array_in_1, lwe_array_in_2,
input_lwe_dimension, input_lwe_ciphertext_count);
// 2. Add "false" plaintext, where "false" is 7 << (32 - 3)
uint32_t *h_false_plaintext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_false_plaintext_array[index] = PLAINTEXT_FALSE;
}
uint32_t *false_plaintext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(false_plaintext_array, h_false_plaintext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
v_stream, gpu_index, lwe_buffer_2, lwe_buffer_1, false_plaintext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_1, stream, gpu_index);
cuda_drop_async(false_plaintext_array, stream, gpu_index);
free(h_false_plaintext_array);
// 3. Compute a PBS with the LUT created below
uint32_t *h_pbs_lut = (uint32_t *)malloc((glwe_dimension + 1) *
polynomial_size * sizeof(uint32_t));
for (uint index = 0; index < (glwe_dimension + 1) * polynomial_size;
index++) {
h_pbs_lut[index] =
index < (glwe_dimension * polynomial_size) ? 0 : PLAINTEXT_TRUE;
}
uint32_t *pbs_lut = (uint32_t *)cuda_malloc_async(
(glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream,
gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut, h_pbs_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *h_pbs_lut_indexes =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_pbs_lut_indexes[index] = 0;
}
uint32_t *pbs_lut_indexes = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_low_latency(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
cuda_drop_async(pbs_lut, stream, gpu_index);
cuda_drop_async(pbs_lut_indexes, stream, gpu_index);
free(h_pbs_lut);
free(h_pbs_lut_indexes);
cuda_keyswitch_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_array_out, lwe_pbs_buffer, ksk,
glwe_dimension * polynomial_size, input_lwe_dimension, ks_base_log,
ks_level_count, input_lwe_ciphertext_count);
cuda_drop_async(lwe_pbs_buffer, stream, gpu_index);
}
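/*
 * Illustrative host-side usage of the AND gate (a documentation sketch, not
 * part of the original sources). Device pointers d_out, d_in_1, d_in_2,
 * d_bsk and d_ksk are assumed to be already allocated and populated on the
 * GPU, and the parameter values below are placeholders.
 */
static void example_boolean_and_usage(uint32_t gpu_index, void *d_out,
                                      void *d_in_1, void *d_in_2, void *d_bsk,
                                      void *d_ksk, uint32_t num_cts,
                                      uint32_t max_shared_memory) {
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  // Hypothetical boolean parameter set, for illustration only.
  uint32_t lwe_dimension = 600, glwe_dimension = 2, polynomial_size = 512;
  uint32_t pbs_base_log = 18, pbs_level_count = 1;
  uint32_t ks_base_log = 4, ks_level_count = 3;
  // Computes d_out[i] = d_in_1[i] AND d_in_2[i] for the whole batch.
  cuda_boolean_and_32(stream, gpu_index, d_out, d_in_1, d_in_2, d_bsk, d_ksk,
                      lwe_dimension, glwe_dimension, polynomial_size,
                      pbs_base_log, pbs_level_count, ks_base_log,
                      ks_level_count, num_cts, max_shared_memory);
  cudaStreamSynchronize(*stream);
  cuda_destroy_stream(stream, gpu_index);
}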
extern "C" void cuda_boolean_nand_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in_1, void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t *lwe_buffer_1 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
// 1. Add the two ciphertexts
cuda_add_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_buffer_1, lwe_array_in_1, lwe_array_in_2,
input_lwe_dimension, input_lwe_ciphertext_count);
// 2. Negate ciphertext
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_negate_lwe_ciphertext_vector_32(v_stream, gpu_index, lwe_buffer_2,
lwe_buffer_1, input_lwe_dimension,
input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_1, stream, gpu_index);
// 3. Add "true" plaintext, where "true" is 1 << (32 - 3)
uint32_t *h_true_plaintext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_true_plaintext_array[index] = PLAINTEXT_TRUE;
}
uint32_t *true_plaintext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
v_stream, gpu_index, lwe_buffer_3, lwe_buffer_2, true_plaintext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
cuda_drop_async(true_plaintext_array, stream, gpu_index);
free(h_true_plaintext_array);
  // 4. Compute a PBS with the LUT created below
uint32_t *h_pbs_lut = (uint32_t *)malloc((glwe_dimension + 1) *
polynomial_size * sizeof(uint32_t));
for (uint index = 0; index < (glwe_dimension + 1) * polynomial_size;
index++) {
h_pbs_lut[index] =
index < (glwe_dimension * polynomial_size) ? 0 : PLAINTEXT_TRUE;
}
uint32_t *pbs_lut = (uint32_t *)cuda_malloc_async(
(glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream,
gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut, h_pbs_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *h_pbs_lut_indexes =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_pbs_lut_indexes[index] = 0;
}
uint32_t *pbs_lut_indexes = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_low_latency(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
cuda_drop_async(pbs_lut, stream, gpu_index);
cuda_drop_async(pbs_lut_indexes, stream, gpu_index);
free(h_pbs_lut);
free(h_pbs_lut_indexes);
cuda_keyswitch_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_array_out, lwe_pbs_buffer, ksk,
glwe_dimension * polynomial_size, input_lwe_dimension, ks_base_log,
ks_level_count, input_lwe_ciphertext_count);
cuda_drop_async(lwe_pbs_buffer, stream, gpu_index);
}
extern "C" void cuda_boolean_nor_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in_1, void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t *lwe_buffer_1 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
// 1. Add the two ciphertexts
cuda_add_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_buffer_1, lwe_array_in_1, lwe_array_in_2,
input_lwe_dimension, input_lwe_ciphertext_count);
// 2. Negate ciphertext
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_negate_lwe_ciphertext_vector_32(v_stream, gpu_index, lwe_buffer_2,
lwe_buffer_1, input_lwe_dimension,
input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_1, stream, gpu_index);
// 3. Add "false" plaintext
uint32_t *h_false_plaintext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_false_plaintext_array[index] = PLAINTEXT_FALSE;
}
uint32_t *false_plaintext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(false_plaintext_array, h_false_plaintext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
v_stream, gpu_index, lwe_buffer_3, lwe_buffer_2, false_plaintext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
cuda_drop_async(false_plaintext_array, stream, gpu_index);
free(h_false_plaintext_array);
  // 4. Compute a PBS with the LUT created below
uint32_t *h_pbs_lut = (uint32_t *)malloc((glwe_dimension + 1) *
polynomial_size * sizeof(uint32_t));
for (uint index = 0; index < (glwe_dimension + 1) * polynomial_size;
index++) {
h_pbs_lut[index] =
index < (glwe_dimension * polynomial_size) ? 0 : PLAINTEXT_TRUE;
}
uint32_t *pbs_lut = (uint32_t *)cuda_malloc_async(
(glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream,
gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut, h_pbs_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *h_pbs_lut_indexes =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_pbs_lut_indexes[index] = 0;
}
uint32_t *pbs_lut_indexes = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_low_latency(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
cuda_drop_async(pbs_lut, stream, gpu_index);
cuda_drop_async(pbs_lut_indexes, stream, gpu_index);
free(h_pbs_lut);
free(h_pbs_lut_indexes);
cuda_keyswitch_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_array_out, lwe_pbs_buffer, ksk,
glwe_dimension * polynomial_size, input_lwe_dimension, ks_base_log,
ks_level_count, input_lwe_ciphertext_count);
cuda_drop_async(lwe_pbs_buffer, stream, gpu_index);
}
extern "C" void cuda_boolean_or_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in_1, void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t *lwe_buffer_1 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
// 1. Add the two ciphertexts
cuda_add_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_buffer_1, lwe_array_in_1, lwe_array_in_2,
input_lwe_dimension, input_lwe_ciphertext_count);
// 2. Add "true" plaintext
uint32_t *h_true_plaintext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_true_plaintext_array[index] = PLAINTEXT_TRUE;
}
uint32_t *true_plaintext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
v_stream, gpu_index, lwe_buffer_2, lwe_buffer_1, true_plaintext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_1, stream, gpu_index);
cuda_drop_async(true_plaintext_array, stream, gpu_index);
free(h_true_plaintext_array);
// 3. Compute a PBS with the LUT created below
uint32_t *h_pbs_lut = (uint32_t *)malloc((glwe_dimension + 1) *
polynomial_size * sizeof(uint32_t));
for (uint index = 0; index < (glwe_dimension + 1) * polynomial_size;
index++) {
h_pbs_lut[index] =
index < (glwe_dimension * polynomial_size) ? 0 : PLAINTEXT_TRUE;
}
uint32_t *pbs_lut = (uint32_t *)cuda_malloc_async(
(glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream,
gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut, h_pbs_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *h_pbs_lut_indexes =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_pbs_lut_indexes[index] = 0;
}
uint32_t *pbs_lut_indexes = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_2, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_low_latency(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
cuda_drop_async(pbs_lut, stream, gpu_index);
cuda_drop_async(pbs_lut_indexes, stream, gpu_index);
free(h_pbs_lut);
free(h_pbs_lut_indexes);
cuda_keyswitch_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_array_out, lwe_pbs_buffer, ksk,
glwe_dimension * polynomial_size, input_lwe_dimension, ks_base_log,
ks_level_count, input_lwe_ciphertext_count);
cuda_drop_async(lwe_pbs_buffer, stream, gpu_index);
}
extern "C" void cuda_boolean_xor_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in_1, void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t *lwe_buffer_1 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
// 1. Add the two ciphertexts
cuda_add_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_buffer_1, lwe_array_in_1, lwe_array_in_2,
input_lwe_dimension, input_lwe_ciphertext_count);
// 2. Add "true" plaintext
uint32_t *h_true_plaintext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_true_plaintext_array[index] = PLAINTEXT_TRUE;
}
uint32_t *true_plaintext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
v_stream, gpu_index, lwe_buffer_2, lwe_buffer_1, true_plaintext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_1, stream, gpu_index);
cuda_drop_async(true_plaintext_array, stream, gpu_index);
free(h_true_plaintext_array);
// 3. Multiply by 2
uint32_t *h_cleartext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_cleartext_array[index] = 2;
}
uint32_t *cleartext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(cleartext_array, h_cleartext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
v_stream, gpu_index, lwe_buffer_3, lwe_buffer_2, cleartext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
// 4. Compute a PBS with the LUT created below
uint32_t *h_pbs_lut = (uint32_t *)malloc((glwe_dimension + 1) *
polynomial_size * sizeof(uint32_t));
for (uint index = 0; index < (glwe_dimension + 1) * polynomial_size;
index++) {
h_pbs_lut[index] =
index < (glwe_dimension * polynomial_size) ? 0 : PLAINTEXT_TRUE;
}
uint32_t *pbs_lut = (uint32_t *)cuda_malloc_async(
(glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream,
gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut, h_pbs_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *h_pbs_lut_indexes =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_pbs_lut_indexes[index] = 0;
}
uint32_t *pbs_lut_indexes = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_3, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_low_latency(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
cuda_drop_async(pbs_lut, stream, gpu_index);
cuda_drop_async(pbs_lut_indexes, stream, gpu_index);
free(h_pbs_lut);
free(h_pbs_lut_indexes);
cuda_keyswitch_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_array_out, lwe_pbs_buffer, ksk,
glwe_dimension * polynomial_size, input_lwe_dimension, ks_base_log,
ks_level_count, input_lwe_ciphertext_count);
cuda_drop_async(lwe_pbs_buffer, stream, gpu_index);
}
extern "C" void cuda_boolean_xnor_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in_1, void *lwe_array_in_2, void *bootstrapping_key,
void *ksk, uint32_t input_lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level_count,
uint32_t ks_base_log, uint32_t ks_level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t *lwe_buffer_1 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
// 1. Add the two ciphertexts
cuda_add_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_buffer_1, lwe_array_in_1, lwe_array_in_2,
input_lwe_dimension, input_lwe_ciphertext_count);
// 2. Add "true" plaintext
uint32_t *h_true_plaintext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_true_plaintext_array[index] = PLAINTEXT_TRUE;
}
uint32_t *true_plaintext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(true_plaintext_array, h_true_plaintext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_2 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
v_stream, gpu_index, lwe_buffer_2, lwe_buffer_1, true_plaintext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_1, stream, gpu_index);
cuda_drop_async(true_plaintext_array, stream, gpu_index);
free(h_true_plaintext_array);
// 3. Negate ciphertext
uint32_t *lwe_buffer_3 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_negate_lwe_ciphertext_vector_32(v_stream, gpu_index, lwe_buffer_3,
lwe_buffer_2, input_lwe_dimension,
input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_2, stream, gpu_index);
// 4. Multiply by 2
uint32_t *h_cleartext_array =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_cleartext_array[index] = 2;
}
uint32_t *cleartext_array = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(cleartext_array, h_cleartext_array,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_buffer_4 = (uint32_t *)cuda_malloc_async(
(input_lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
v_stream, gpu_index, lwe_buffer_4, lwe_buffer_3, cleartext_array,
input_lwe_dimension, input_lwe_ciphertext_count);
cuda_drop_async(lwe_buffer_3, stream, gpu_index);
// 5. Compute a PBS with the LUT created below
uint32_t *h_pbs_lut = (uint32_t *)malloc((glwe_dimension + 1) *
polynomial_size * sizeof(uint32_t));
for (uint index = 0; index < (glwe_dimension + 1) * polynomial_size;
index++) {
h_pbs_lut[index] =
index < (glwe_dimension * polynomial_size) ? 0 : PLAINTEXT_TRUE;
}
uint32_t *pbs_lut = (uint32_t *)cuda_malloc_async(
(glwe_dimension + 1) * polynomial_size * sizeof(uint32_t), stream,
gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut, h_pbs_lut,
(glwe_dimension + 1) * polynomial_size *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *h_pbs_lut_indexes =
(uint32_t *)malloc(input_lwe_ciphertext_count * sizeof(uint32_t));
for (uint index = 0; index < input_lwe_ciphertext_count; index++) {
h_pbs_lut_indexes[index] = 0;
}
uint32_t *pbs_lut_indexes = (uint32_t *)cuda_malloc_async(
input_lwe_ciphertext_count * sizeof(uint32_t), stream, gpu_index);
cuda_memcpy_async_to_gpu(pbs_lut_indexes, h_pbs_lut_indexes,
input_lwe_ciphertext_count * sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
uint32_t *lwe_pbs_buffer = (uint32_t *)cuda_malloc_async(
(glwe_dimension * polynomial_size + 1) * input_lwe_ciphertext_count *
sizeof(uint32_t),
stream, gpu_index);
check_cuda_error(cudaGetLastError());
int8_t *pbs_buffer = nullptr;
scratch_cuda_bootstrap_low_latency_32(
v_stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level_count, input_lwe_ciphertext_count, max_shared_memory, true);
cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_pbs_buffer, pbs_lut, pbs_lut_indexes,
lwe_buffer_4, bootstrapping_key, pbs_buffer, input_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level_count,
input_lwe_ciphertext_count, 1, 0, max_shared_memory);
cleanup_cuda_bootstrap_low_latency(v_stream, gpu_index, &pbs_buffer);
check_cuda_error(cudaGetLastError());
cuda_drop_async(lwe_buffer_4, stream, gpu_index);
cuda_drop_async(pbs_lut, stream, gpu_index);
cuda_drop_async(pbs_lut_indexes, stream, gpu_index);
free(h_pbs_lut);
free(h_pbs_lut_indexes);
cuda_keyswitch_lwe_ciphertext_vector_32(
v_stream, gpu_index, lwe_array_out, lwe_pbs_buffer, ksk,
glwe_dimension * polynomial_size, input_lwe_dimension, ks_base_log,
ks_level_count, input_lwe_ciphertext_count);
cuda_drop_async(lwe_pbs_buffer, stream, gpu_index);
}
#endif // CUDA_BOOLEAN_GATES_CU


@@ -1 +0,0 @@
#include "crypto/bootstrapping_key.cuh"


@@ -1,378 +0,0 @@
#include <err.h>
#include "bootstrap_amortized.cuh"
/*
* Returns the buffer size for 64-bit executions
*/
uint64_t get_buffer_size_bootstrap_amortized_64(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
return get_buffer_size_bootstrap_amortized<uint64_t>(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory);
}
/*
* Runs standard checks to validate the inputs
*/
void checks_fast_bootstrap_amortized(int polynomial_size) {
assert(
("Error (GPU amortized PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192, 16384",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192 ||
polynomial_size == 16384));
}
/*
* Runs standard checks to validate the inputs
*/
void checks_bootstrap_amortized(int nbits, int base_log, int polynomial_size) {
assert(("Error (GPU amortized PBS): base log should be <= nbits",
base_log <= nbits));
checks_fast_bootstrap_amortized(polynomial_size);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the amortized PBS on 32-bit inputs, into `pbs_buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
void scratch_cuda_bootstrap_amortized_32(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_amortized(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<256>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<512>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<1024>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<2048>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<4096>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<8192>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_bootstrap_amortized<uint32_t, int32_t, AmortizedDegree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the amortized PBS on 64-bit inputs, into `pbs_buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
void scratch_cuda_bootstrap_amortized_64(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_amortized(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<256>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<512>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<1024>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<4096>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<8192>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
case 16384:
scratch_bootstrap_amortized<uint64_t, int64_t, AmortizedDegree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/* Perform the programmable bootstrapping on a batch of input u32 LWE
* ciphertexts. See the corresponding 64-bit operation for more details.
*/
void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
checks_bootstrap_amortized(32, base_log, polynomial_size);
switch (polynomial_size) {
case 256:
host_bootstrap_amortized<uint32_t, AmortizedDegree<256>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 512:
host_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 16384:
host_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out, (uint32_t *)lut_vector,
(uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/* Perform the programmable bootstrapping on a batch of input u64 LWE
* ciphertexts. This function performs best for large numbers of inputs (> 10).
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which test vector of lut_vector to use for each LWE input in
* lwe_array_in
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: GGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
* where H is the gadget matrix and Z is a matrix (k+1).l
* containing GLWE encryptions of 0 under sk2.
* bsk is thus a tensor of size (k+1)^2.l.N.n
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the test vector
* are the same because they have to be in the same ring
* to be multiplied.
* - input_lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomials (~1024) (where `size` refers to the polynomial degree + 1).
* - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
* - num_lut_vectors: parameter to set the actual number of test vectors to be
* used
* - lwe_idx: the index of the LWE input to consider for the GPU of index
* gpu_index. In case of multi-GPU computing, it is assumed that only a part of
* the input LWE array is copied to each GPU, but the whole LUT array is copied
* (because the case when the number of LUTs is smaller than the number of input
* LWEs is not trivial to take into account in the data repartition on the
* GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE
* input in the LUT array `lut_vector`.
* - 'max_shared_memory' maximum amount of shared memory to be used inside
* device functions
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples blocks of threads are launched, where each thread is going
* to handle one or more polynomial coefficients at each stage:
* - perform the blind rotation
* - round the result
* - decompose into level_count levels, then for each level:
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels
* - the constant memory (64K) is used for storing the roots of unity
* values for the FFT
*/
void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
checks_bootstrap_amortized(64, base_log, polynomial_size);
switch (polynomial_size) {
case 256:
host_bootstrap_amortized<uint64_t, AmortizedDegree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 512:
host_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 1024:
host_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 2048:
host_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 4096:
host_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 8192:
host_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
case 16384:
host_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out, (uint64_t *)lut_vector,
(uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
(double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, num_samples, num_lut_vectors,
lwe_idx, max_shared_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/*
* This cleanup function frees the data for the amortized PBS on GPU in
* pbs_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*pbs_buffer, stream, gpu_index);
}
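/*
 * A minimal usage sketch of the API above. The scratch function name and the
 * numeric parameters below are illustrative assumptions chosen to match the
 * orders of magnitude quoted in the comments (n ~ 600, N ~ 1024,
 * base_log ~ 8, level_count ~ 4):
 *
 *   int8_t *pbs_buffer = nullptr;
 *   scratch_cuda_bootstrap_amortized_64(stream, gpu_index, &pbs_buffer,
 *                                       1, 1024, num_samples,
 *                                       max_shared_memory, true);
 *   cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
 *       stream, gpu_index, lwe_out, lut, lut_indexes, lwe_in, bsk, pbs_buffer,
 *       600, 1, 1024, 8, 4, num_samples, 1, 0, max_shared_memory);
 *   cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
 */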

View File

@@ -1,363 +0,0 @@
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#ifndef CNCRT_AMORTIZED_PBS_H
#define CNCRT_AMORTIZED_PBS_H
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/timer.cuh"
template <typename Torus, class params, sharedMemDegree SMD>
/*
* Kernel launched by host_bootstrap_amortized
*
* Uses shared memory to increase performance
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to which test vector
* to use for each sample in lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: RGSW encryption of the LWE secret key sk1 under secret
* key sk2
* - device_mem: pointer to the device's global memory in case we use it (SMD
* == NOSM or PARTIALSM)
* - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
 * - lwe_idx: index of the first sample handled by the current GPU, i.e. the
 * number of samples per GPU times the GPU index (useful for multi-GPU
 * computations)
* - device_memory_size_per_sample: amount of global memory to allocate if SMD
* is not FULLSM
*/
__global__ void device_bootstrap_amortized(
Torus *lwe_array_out, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, double2 *bootstrapping_key, int8_t *device_mem,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t lwe_idx,
size_t device_memory_size_per_sample) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
if constexpr (SMD == FULLSM)
selected_memory = sharedmem;
else
selected_memory = &device_mem[blockIdx.x * device_memory_size_per_sample];
  // The accumulator is a GLWE ciphertext: it holds glwe_dimension mask
  // polynomials plus one body polynomial.
Torus *accumulator = (Torus *)selected_memory;
Torus *accumulator_rotated =
(Torus *)accumulator +
(ptrdiff_t)((glwe_dimension + 1) * polynomial_size);
double2 *res_fft =
(double2 *)accumulator_rotated + (glwe_dimension + 1) * polynomial_size /
(sizeof(double2) / sizeof(Torus));
double2 *accumulator_fft = (double2 *)sharedmem;
if constexpr (SMD != PARTIALSM)
accumulator_fft = (double2 *)res_fft +
(ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2);
auto block_lwe_array_in = &lwe_array_in[blockIdx.x * (lwe_dimension + 1)];
Torus *block_lut_vector =
&lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
(glwe_dimension + 1)];
// Put "b", the body, in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, block_lut_vector, b_hat, false, glwe_dimension + 1);
// Loop over all the mask elements of the sample to accumulate
// (X^a_i-1) multiplication, decomposition of the resulting polynomial
// into level_count polynomials, and performing polynomial multiplication
// via an FFT with the RGSW encrypted secret key
for (int iteration = 0; iteration < lwe_dimension; iteration++) {
synchronize_threads_in_block();
// Put "a" in [0, 2N[ instead of Zq
Torus a_hat = 0;
rescale_torus_element(block_lwe_array_in[iteration], a_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
    // Perform ACC * (X^a_hat - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat, glwe_dimension + 1);
synchronize_threads_in_block();
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count, glwe_dimension + 1);
// Initialize the polynomial multiplication via FFT arrays
// The polynomial multiplications happens at the block level
// and each thread handles two or more coefficients
int pos = threadIdx.x;
for (int i = 0; i < (glwe_dimension + 1); i++)
for (int j = 0; j < params::opt / 2; j++) {
res_fft[pos].x = 0;
res_fft[pos].y = 0;
pos += params::degree / params::opt;
}
GadgetMatrix<Torus, params> gadget(base_log, level_count,
accumulator_rotated, glwe_dimension + 1);
// Now that the rotation is done, decompose the resulting polynomial
// coefficients so as to multiply each decomposed level with the
// corresponding part of the bootstrapping key
for (int level = level_count - 1; level >= 0; level--) {
for (int i = 0; i < (glwe_dimension + 1); i++) {
gadget.decompose_and_compress_next_polynomial(accumulator_fft, i);
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice = get_ith_mask_kth_block(bootstrapping_key, iteration, i,
level, polynomial_size,
glwe_dimension, level_count);
// Perform the coefficient-wise product with the two pieces of
// bootstrapping key
for (int j = 0; j < (glwe_dimension + 1); j++) {
auto bsk_poly = bsk_slice + j * params::degree / 2;
auto res_fft_poly = res_fft + j * params::degree / 2;
polynomial_product_accumulate_in_fourier_domain<params, double2>(
res_fft_poly, accumulator_fft, bsk_poly);
}
}
synchronize_threads_in_block();
}
// Come back to the coefficient representation
if constexpr (SMD == FULLSM || SMD == NOSM) {
synchronize_threads_in_block();
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto res_fft_slice = res_fft + i * params::degree / 2;
NSMFFT_inverse<HalfDegree<params>>(res_fft_slice);
}
synchronize_threads_in_block();
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto accumulator_slice = accumulator + i * params::degree;
auto res_fft_slice = res_fft + i * params::degree / 2;
add_to_torus<Torus, params>(res_fft_slice, accumulator_slice);
}
synchronize_threads_in_block();
} else {
#pragma unroll
for (int i = 0; i < (glwe_dimension + 1); i++) {
auto accumulator_slice = accumulator + i * params::degree;
auto res_fft_slice = res_fft + i * params::degree / 2;
int tid = threadIdx.x;
for (int j = 0; j < params::opt / 2; j++) {
accumulator_fft[tid] = res_fft_slice[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(accumulator_fft, accumulator_slice);
}
synchronize_threads_in_block();
}
}
auto block_lwe_array_out =
&lwe_array_out[blockIdx.x * (glwe_dimension * polynomial_size + 1)];
// The blind rotation for this block is over
// Now we can perform the sample extraction: for the body it's just
// the resulting constant coefficient of the accumulator
// For the mask it's more complicated
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator,
glwe_dimension);
}
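/*
 * Worked example of the rescaling step used above (a sketch assuming
 * Torus = uint64_t and N = params::degree = 1024, so 2N = 2048):
 * rescale_torus_element is expected to map a torus element t to
 * round(t * 2N / 2^64), i.e. to keep its top log2(2N) = 11 bits with
 * rounding. For instance t = 2^63 (the encoding of 1/2) maps to
 * b_hat = 1024 = N, and dividing the accumulator by X^N in the negacyclic
 * ring Z[X]/(X^N + 1) negates the test polynomial.
 */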
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_full_sm_bootstrap_amortized(
uint32_t polynomial_size, uint32_t glwe_dimension) {
return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // accumulator
sizeof(Torus) * polynomial_size *
(glwe_dimension + 1) + // accumulator rotated
sizeof(double2) * polynomial_size / 2 + // accumulator fft
sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1); // res fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_amortized(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_amortized(
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count;
} else if (max_shared_memory < full_sm) {
device_mem = partial_dm * input_lwe_ciphertext_count;
}
return device_mem + device_mem % sizeof(double2);
}
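/*
 * Worked example for the three buffer sizes above, assuming Torus = uint64_t,
 * polynomial_size N = 1024 and glwe_dimension k = 1:
 *   full_sm    = 8*1024*2 + 8*1024*2 + 16*512 + 16*512*2 = 57344 B (56 KB)
 *   partial_sm = 16*512                                  =  8192 B  (8 KB)
 * On a device exposing only 48 KB of shared memory per block, partial_sm
 * fits but full_sm does not, so partial_dm = 57344 - 8192 = 49152 B of
 * global memory is reserved per input ciphertext.
 */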
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_amortized(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t input_lwe_ciphertext_count,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
cudaFuncSetAttribute(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
partial_sm);
cudaFuncSetCacheConfig(device_bootstrap_amortized<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
check_cuda_error(cudaFuncSetCacheConfig(
device_bootstrap_amortized<Torus, params, FULLSM>,
cudaFuncCachePreferShared));
}
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_bootstrap_amortized<Torus>(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
}
template <typename Torus, class params>
__host__ void host_bootstrap_amortized(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key,
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
uint64_t SM_FULL = get_buffer_size_full_sm_bootstrap_amortized<Torus>(
polynomial_size, glwe_dimension);
uint64_t SM_PART =
get_buffer_size_partial_sm_bootstrap_amortized<Torus>(polynomial_size);
uint64_t DM_PART = SM_FULL - SM_PART;
uint64_t DM_FULL = SM_FULL;
auto stream = static_cast<cudaStream_t *>(v_stream);
// Create a 1-dimensional grid of threads
// where each block handles 1 sample and each thread
// handles opt polynomial coefficients
// (actually opt/2 coefficients since we compress the real polynomial into a
// complex)
dim3 grid(input_lwe_ciphertext_count, 1, 1);
dim3 thds(polynomial_size / params::opt, 1, 1);
// Launch the kernel using polynomial_size/opt threads
// where each thread computes opt polynomial coefficients
// Depending on the required amount of shared memory, choose
// from one of three templates (no use, partial use or full use
// of shared memory)
if (max_shared_memory < SM_PART) {
device_bootstrap_amortized<Torus, params, NOSM><<<grid, thds, 0, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, DM_FULL);
} else if (max_shared_memory < SM_FULL) {
device_bootstrap_amortized<Torus, params, PARTIALSM>
<<<grid, thds, SM_PART, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, DM_PART);
} else {
// For devices with compute capability 7.x a single thread block can
// address the full capacity of shared memory. Shared memory on the
// device then has to be allocated dynamically.
// For lower compute capabilities, this call
// just does nothing and the amount of shared memory used is 48 KB
device_bootstrap_amortized<Torus, params, FULLSM>
<<<grid, thds, SM_FULL, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, base_log, level_count, lwe_idx, 0);
}
check_cuda_error(cudaGetLastError());
}
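/*
 * Example launch geometry for the kernel calls above (assuming N = 1024 and
 * an illustrative params::opt = 8): grid = (input_lwe_ciphertext_count, 1, 1)
 * and thds = 1024 / 8 = 128 threads per block, each thread handling 8 torus
 * coefficients, i.e. 4 complex values once the real polynomial is compressed.
 */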
template <typename Torus, class params>
int cuda_get_pbs_per_gpu(int polynomial_size) {
int blocks_per_sm = 0;
int num_threads = polynomial_size / params::opt;
  cudaDeviceProp device_properties;
  cudaGetDeviceProperties(&device_properties, 0);
  // Query the occupancy of the FULLSM kernel variant on device 0
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocks_per_sm, device_bootstrap_amortized<Torus, params, FULLSM>,
      num_threads, 0);
return device_properties.multiProcessorCount * blocks_per_sm;
}
#endif // CNCRT_AMORTIZED_PBS_H

View File

@@ -1,452 +0,0 @@
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#ifndef LOWLAT_FAST_PBS_H
#define LOWLAT_FAST_PBS_H
#include "cooperative_groups.h"
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/timer.cuh"
// Cooperative groups are used in the low latency PBS
using namespace cooperative_groups;
namespace cg = cooperative_groups;
template <typename Torus, class params>
__device__ void mul_ggsw_glwe(Torus *accumulator, double2 *fft,
double2 *join_buffer, double2 *bootstrapping_key,
int polynomial_size, uint32_t glwe_dimension,
int level_count, int iteration,
grid_group &grid) {
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(fft);
synchronize_threads_in_block();
// Get the pieces of the bootstrapping key that will be needed for the
// external product; blockIdx.x is the ID of the block that's executing
// this function, so we end up getting the lines of the bootstrapping key
// needed to perform the external product in this block (corresponding to
// the same decomposition level)
auto bsk_slice = get_ith_mask_kth_block(
bootstrapping_key, iteration, blockIdx.y, blockIdx.x, polynomial_size,
glwe_dimension, level_count);
// Selects all GLWEs in a particular decomposition level
auto level_join_buffer =
join_buffer + blockIdx.x * (glwe_dimension + 1) * params::degree / 2;
// Perform the matrix multiplication between the GGSW and the GLWE,
// each block operating on a single level for mask and body
// The first product is used to initialize level_join_buffer
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
auto buffer_slice = level_join_buffer + blockIdx.y * params::degree / 2;
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] = fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
grid.sync();
// Continues multiplying fft by every polynomial in that particular bsk level
// Each y-block accumulates in a different polynomial at each iteration
for (int j = 1; j < (glwe_dimension + 1); j++) {
int idx = (j + blockIdx.y) % (glwe_dimension + 1);
auto bsk_poly = bsk_slice + idx * params::degree / 2;
auto buffer_slice = level_join_buffer + idx * params::degree / 2;
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
buffer_slice[tid] += fft[tid] * bsk_poly[tid];
tid += params::degree / params::opt;
}
grid.sync();
}
// -----------------------------------------------------------------
// All blocks are synchronized here; after this sync, level_join_buffer has
// the values needed from every other block
auto src_acc = join_buffer + blockIdx.y * params::degree / 2;
// copy first product into fft buffer
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = src_acc[tid];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// accumulate rest of the products into fft buffer
for (int l = 1; l < gridDim.x; l++) {
auto cur_src_acc = &src_acc[l * (glwe_dimension + 1) * params::degree / 2];
tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] += cur_src_acc[tid];
tid += params::degree / params::opt;
}
}
synchronize_threads_in_block();
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
NSMFFT_inverse<HalfDegree<params>>(fft);
synchronize_threads_in_block();
add_to_torus<Torus, params>(fft, accumulator);
__syncthreads();
}
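/*
 * Worked example of the join buffer layout used above (illustrative
 * parameters: glwe_dimension k = 1, level_count = 2, N = params::degree =
 * 1024): the grid is (level_count, k + 1, num_samples) = (2, 2, num_samples),
 * and each sample owns level_count * (k + 1) * N / 2 = 2048 double2 values in
 * join_buffer. Each (blockIdx.x, blockIdx.y) pair writes its own N/2 slice
 * before grid.sync(), after which every block reads and accumulates all
 * level_count * (k + 1) slices into its fft buffer.
 */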
template <typename Torus, class params, sharedMemDegree SMD>
/*
* Kernel launched by the low latency version of the
* bootstrapping, that uses cooperative groups
*
 * - lwe_array_out: vector of output LWE ciphertexts, with length
 * (glwe_dimension * polynomial_size + 1) * num_samples
 * - lut_vector: vector of look up tables with
 * length (glwe_dimension + 1) * polynomial_size * num_samples
 * - lut_vector_indexes: mapping between lwe_array_in and lut_vector
 * - lwe_array_in: vector of LWE inputs with length (lwe_dimension + 1) *
 * num_samples
*
* Each y-block computes one element of the lwe_array_out.
*/
__global__ void device_bootstrap_fast_low_latency(
Torus *lwe_array_out, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, double2 *bootstrapping_key, double2 *join_buffer,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
grid_group grid = this_grid();
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
// We always compute the pointer with most restrictive alignment to avoid
// alignment issues
double2 *accumulator_fft = (double2 *)selected_memory;
Torus *accumulator =
(Torus *)accumulator_fft +
(ptrdiff_t)(sizeof(double2) * polynomial_size / 2 / sizeof(Torus));
Torus *accumulator_rotated =
(Torus *)accumulator + (ptrdiff_t)polynomial_size;
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double2 *)sharedmem;
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
Torus *block_lwe_array_in = &lwe_array_in[blockIdx.z * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
double2 *block_join_buffer =
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
params::degree / 2];
  // Since the space in L1 cache is small, the rotated accumulator and the fft
  // accumulator share the same memory location: the rotated array is no
  // longer in use by the time the fft is performed
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
for (int i = 0; i < lwe_dimension; i++) {
synchronize_threads_in_block();
// Put "a" in [0, 2N[
Torus a_hat = 0;
rescale_torus_element(block_lwe_array_in[i], a_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
    // Perform ACC * (X^a_hat - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(
accumulator, accumulator_rotated, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_rotated, base_log, level_count);
synchronize_threads_in_block();
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count,
accumulator_rotated);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, params>(
accumulator, accumulator_fft, block_join_buffer, bootstrapping_key,
polynomial_size, glwe_dimension, level_count, i, grid);
synchronize_threads_in_block();
}
auto block_lwe_array_out =
&lwe_array_out[blockIdx.z * (glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result, but
// we do the computation at block 0 to avoid waiting for extra blocks, in
// case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_fast_low_latency(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_fast_low_latency(
uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft mask & body
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_fast_low_latency(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_dm = full_sm - partial_sm;
uint64_t full_dm = full_sm;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm) {
device_mem = partial_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
}
uint64_t buffer_size = device_mem + (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
polynomial_size / 2 * sizeof(double2);
return buffer_size + buffer_size % sizeof(double2);
}
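/*
 * Worked example for the sizes above, assuming Torus = uint64_t, N = 1024,
 * glwe_dimension k = 1 and level_count = 2:
 *   full_sm    = 8*1024 + 8*1024 + 16*512 = 24576 B (24 KB)
 *   partial_sm = 16*512                   =  8192 B  (8 KB)
 * and the join buffer adds (k + 1) * level_count * N/2 * 16 = 32768 B per
 * input ciphertext. When max_shared_memory >= 24576, no extra per-block
 * device scratch is needed and only the join buffer is allocated.
 */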
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_fast_low_latency(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_fast_low_latency<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
cudaFuncSetCacheConfig(
device_bootstrap_fast_low_latency<Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
}
/*
* Host wrapper to the low latency version
* of bootstrapping
*/
template <typename Torus, class params>
__host__ void host_bootstrap_fast_low_latency(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key,
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
// With SM each block corresponds to either the mask or body, no need to
// duplicate data for each
uint64_t full_sm = get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
polynomial_size);
uint64_t full_dm = full_sm;
uint64_t partial_dm = full_dm - partial_sm;
int8_t *d_mem = pbs_buffer;
double2 *buffer_fft =
(double2 *)d_mem +
(ptrdiff_t)(get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory) /
sizeof(double2) -
(glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count * polynomial_size / 2);
int thds = polynomial_size / params::opt;
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
void *kernel_args[12];
kernel_args[0] = &lwe_array_out;
kernel_args[1] = &lut_vector;
kernel_args[2] = &lut_vector_indexes;
kernel_args[3] = &lwe_array_in;
kernel_args[4] = &bootstrapping_key;
kernel_args[5] = &buffer_fft;
kernel_args[6] = &lwe_dimension;
kernel_args[7] = &polynomial_size;
kernel_args[8] = &base_log;
kernel_args[9] = &level_count;
kernel_args[10] = &d_mem;
if (max_shared_memory < partial_sm) {
kernel_args[11] = &full_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_bootstrap_fast_low_latency<Torus, params, NOSM>, grid,
thds, (void **)kernel_args, 0, *stream));
} else if (max_shared_memory < full_sm) {
kernel_args[11] = &partial_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
grid, thds, (void **)kernel_args, partial_sm, *stream));
} else {
int no_dm = 0;
kernel_args[11] = &no_dm;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_bootstrap_fast_low_latency<Torus, params, FULLSM>, grid,
thds, (void **)kernel_args, full_sm, *stream));
}
check_cuda_error(cudaGetLastError());
}
// Verify if the grid size for the low latency kernel satisfies the cooperative
// group constraints
template <typename Torus, class params>
__host__ bool verify_cuda_bootstrap_fast_low_latency_grid_size(
int glwe_dimension, int level_count, int num_samples,
uint32_t max_shared_memory) {
// If Cooperative Groups is not supported, no need to check anything else
if (!cuda_check_support_cooperative_groups())
return false;
// Calculate the dimension of the kernel
uint64_t full_sm =
get_buffer_size_full_sm_bootstrap_fast_low_latency<Torus>(params::degree);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_fast_low_latency<Torus>(
params::degree);
int thds = params::degree / params::opt;
// Get the maximum number of active blocks per streaming multiprocessors
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
int max_active_blocks_per_sm;
if (max_shared_memory < partial_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_bootstrap_fast_low_latency<Torus, params, NOSM>, thds,
0);
} else if (max_shared_memory < full_sm) {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_bootstrap_fast_low_latency<Torus, params, PARTIALSM>,
thds, 0);
} else {
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_bootstrap_fast_low_latency<Torus, params, FULLSM>, thds,
0);
}
// Get the number of streaming multiprocessors
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
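/*
 * Worked example of the constraint above (hypothetical numbers): with
 * level_count = 2, glwe_dimension = 1 and num_samples = 100, the cooperative
 * launch needs 2 * 2 * 100 = 400 resident blocks. On a device with 108 SMs
 * and max_active_blocks_per_sm = 4 the budget is 432 blocks, so the fast
 * path is taken; with num_samples = 200 (800 blocks) it is not, and the
 * dispatching wrappers fall back to the non-fast kernel.
 */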
#endif // LOWLAT_FAST_PBS_H

View File

@@ -1,313 +0,0 @@
#ifndef FASTMULTIBIT_PBS_H
#define FASTMULTIBIT_PBS_H
#include "bootstrap.h"
#include "bootstrap_multibit.cuh"
#include "bootstrap_multibit.h"
#include "complex/operations.cuh"
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/ggsw.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/timer.cuh"
#include <vector>
template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_fast_accumulate(
Torus *lwe_array_out, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, double2 *keybundle_array, double2 *join_buffer,
Torus *global_accumulator, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint32_t keybundle_size_per_input) {
grid_group grid = this_grid();
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
selected_memory = sharedmem;
// We always compute the pointer with most restrictive alignment to avoid
// alignment issues
double2 *accumulator_fft = (double2 *)selected_memory;
Torus *accumulator =
(Torus *)accumulator_fft +
(ptrdiff_t)(sizeof(double2) * polynomial_size / 2 / sizeof(Torus));
// The third dimension of the block is used to determine on which ciphertext
// this block is operating, in the case of batch bootstraps
Torus *block_lwe_array_in = &lwe_array_in[blockIdx.z * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
double2 *block_join_buffer =
&join_buffer[blockIdx.z * level_count * (glwe_dimension + 1) *
params::degree / 2];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
double2 *keybundle = keybundle_array +
// select the input
blockIdx.z * keybundle_size_per_input;
if (lwe_offset == 0) {
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
global_slice, accumulator);
}
for (int i = 0; (i + lwe_offset) < lwe_dimension && i < lwe_chunk_size; i++) {
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
    // Synchronize to make sure the decomposition written into accumulator_fft
    // is complete before it is consumed by the external product
synchronize_threads_in_block();
// Perform G^-1(ACC) * GGSW -> GLWE
mul_ggsw_glwe<Torus, params>(accumulator, accumulator_fft,
block_join_buffer, keybundle, polynomial_size,
glwe_dimension, level_count, i, grid);
synchronize_threads_in_block();
}
if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) {
auto block_lwe_array_out =
&lwe_array_out[blockIdx.z * (glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.x == 0 && blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.x == 0 && blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
} else {
      // Store the accumulator to global memory so that the next chunk
      // iteration can resume from it
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
accumulator, global_slice);
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_fast_multibit_bootstrap(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * 2; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_fast_multibit_bootstrap(
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t lwe_chunk_size,
uint32_t max_shared_memory) {
uint64_t buffer_size = 0;
buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2) * sizeof(double2); // keybundle fft
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) *
sizeof(double2); // join buffer
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
polynomial_size * sizeof(Torus); // global_accumulator
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_fast_multi_bit_pbs(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_accumulate =
get_buffer_size_full_sm_fast_multibit_bootstrap<Torus>(polynomial_size);
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_bootstrap_keybundle<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(device_multi_bit_bootstrap_keybundle<Torus, params>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_bootstrap_fast_accumulate<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_accumulate));
cudaFuncSetCacheConfig(
device_multi_bit_bootstrap_fast_accumulate<Torus, params>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
if (allocate_gpu_memory) {
if (!lwe_chunk_size)
lwe_chunk_size =
get_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
input_lwe_ciphertext_count);
uint64_t buffer_size = get_buffer_size_fast_multibit_bootstrap<Torus>(
lwe_dimension, glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, grouping_factor, lwe_chunk_size,
max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
}
template <typename Torus, typename STorus, class params>
__host__ void host_fast_multi_bit_pbs(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, uint64_t *bootstrapping_key,
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size(lwe_dimension, level_count,
glwe_dimension, num_samples);
//
double2 *keybundle_fft = (double2 *)pbs_buffer;
double2 *buffer_fft = (double2 *)keybundle_fft +
num_samples * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2);
Torus *global_accumulator =
(Torus *)buffer_fft +
(ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) / sizeof(Torus));
//
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_accumulate =
get_buffer_size_full_sm_fast_multibit_bootstrap<Torus>(polynomial_size);
uint32_t keybundle_size_per_input =
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
//
void *kernel_args[16];
kernel_args[0] = &lwe_array_out;
kernel_args[1] = &lut_vector;
kernel_args[2] = &lut_vector_indexes;
kernel_args[3] = &lwe_array_in;
kernel_args[4] = &keybundle_fft;
kernel_args[5] = &buffer_fft;
kernel_args[6] = &global_accumulator;
kernel_args[7] = &lwe_dimension;
kernel_args[8] = &glwe_dimension;
kernel_args[9] = &polynomial_size;
kernel_args[10] = &base_log;
kernel_args[11] = &level_count;
kernel_args[12] = &grouping_factor;
kernel_args[15] = &keybundle_size_per_input;
//
dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples);
dim3 thds(polynomial_size / params::opt, 1, 1);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {
uint32_t chunk_size = std::min(
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
// Compute a keybundle
dim3 grid_keybundle(num_samples * chunk_size,
(glwe_dimension + 1) * (glwe_dimension + 1),
level_count);
device_multi_bit_bootstrap_keybundle<Torus, params>
<<<grid_keybundle, thds, full_sm_keybundle, *stream>>>(
lwe_array_in, keybundle_fft, bootstrapping_key, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log,
level_count, lwe_offset, chunk_size, keybundle_size_per_input);
check_cuda_error(cudaGetLastError());
kernel_args[13] = &lwe_offset;
kernel_args[14] = &chunk_size;
check_cuda_error(cudaLaunchCooperativeKernel(
(void *)device_multi_bit_bootstrap_fast_accumulate<Torus, params>,
grid_accumulate, thds, (void **)kernel_args, full_sm_accumulate,
*stream));
}
}
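/*
 * Worked example of the chunked loop above (illustrative numbers): with
 * lwe_dimension = 768 and grouping_factor = 2 there are 768 / 2 = 384
 * keybundle positions; if get_lwe_chunk_size returns lwe_chunk_size = 100,
 * the loop runs with lwe_offset = 0, 100, 200, 300 and chunk_size =
 * 100, 100, 100, 84, launching one keybundle kernel and one cooperative
 * accumulation kernel per chunk.
 */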
// Verify if the grid size for the multi-bit kernel satisfies the cooperative
// group constraints
template <typename Torus, class params>
__host__ bool
verify_cuda_bootstrap_fast_multi_bit_grid_size(int glwe_dimension,
int level_count, int num_samples,
uint32_t max_shared_memory) {
// If Cooperative Groups is not supported, no need to check anything else
if (!cuda_check_support_cooperative_groups())
return false;
// Calculate the dimension of the kernel
uint64_t full_sm =
get_buffer_size_full_sm_fast_multibit_bootstrap<Torus>(params::degree);
int thds = params::degree / params::opt;
// Get the maximum number of active blocks per streaming multiprocessors
int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
int max_active_blocks_per_sm;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&max_active_blocks_per_sm,
(void *)device_multi_bit_bootstrap_fast_accumulate<Torus, params>, thds,
full_sm);
// Get the number of streaming multiprocessors
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
return number_of_blocks <= max_active_blocks_per_sm * number_of_sm;
}
#endif // FASTMULTIBIT_PBS_H

View File

@@ -1,773 +0,0 @@
#include <err.h>
#include "bootstrap_fast_low_latency.cuh"
#include "bootstrap_low_latency.cuh"
/*
* Returns the buffer size for 64 bits executions
*/
uint64_t get_buffer_size_bootstrap_low_latency_64(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 1024:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 2048:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 4096:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 8192:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
uint64_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
input_lwe_ciphertext_count,
max_shared_memory))
return get_buffer_size_bootstrap_fast_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
else
return get_buffer_size_bootstrap_low_latency<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
return 0;
break;
}
}
/*
* Runs standard checks to validate the inputs
*/
void checks_fast_bootstrap_low_latency(int glwe_dimension, int level_count,
int polynomial_size, int num_samples) {
assert((
"Error (GPU low latency PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192, 16384",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192 ||
polynomial_size == 16384));
}
/*
* Runs standard checks to validate the inputs
*/
void checks_bootstrap_low_latency(int nbits, int glwe_dimension,
int level_count, int base_log,
int polynomial_size, int num_samples) {
assert(("Error (GPU low latency PBS): base log should be <= nbits",
base_log <= nbits));
checks_fast_bootstrap_low_latency(glwe_dimension, level_count,
polynomial_size, num_samples);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the low latency PBS on 32 bits inputs, into `pbs_buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
void scratch_cuda_bootstrap_low_latency_32(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_low_latency(
glwe_dimension, level_count, polynomial_size, input_lwe_ciphertext_count);
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<256>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<512>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
    break;
  case 1024:
    if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
                                                         AmortizedDegree<1024>>(
            glwe_dimension, level_count, input_lwe_ciphertext_count,
            max_shared_memory))
      scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
                                         AmortizedDegree<1024>>(
          v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, max_shared_memory,
          allocate_gpu_memory);
    else
      scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<1024>>(
          v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
          level_count, input_lwe_ciphertext_count, max_shared_memory,
          allocate_gpu_memory);
    break;
  case 2048:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<2048>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 4096:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<4096>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 8192:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<8192>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
uint32_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint32_t, int32_t,
AmortizedDegree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint32_t, int32_t, Degree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the low_latency PBS on 64 bits inputs, into `pbs_buffer`. It also
* configures SM options on the GPU in case FULLSM or PARTIALSM mode is going to
* be used.
*/
void scratch_cuda_bootstrap_low_latency_64(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_bootstrap_low_latency(
glwe_dimension, level_count, polynomial_size, input_lwe_ciphertext_count);
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<256>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<512>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 1024:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<1024>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 2048:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<2048>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 4096:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<4096>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 8192:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<8192>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
uint64_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
input_lwe_ciphertext_count,
max_shared_memory))
scratch_bootstrap_fast_low_latency<uint64_t, int64_t,
AmortizedDegree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
else
scratch_bootstrap_low_latency<uint64_t, int64_t, Degree<16384>>(
v_stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count, max_shared_memory,
allocate_gpu_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/* Perform bootstrapping on a batch of input u32 LWE ciphertexts.
* This function performs best for small numbers of inputs. Beyond a certain
* number of inputs (the exact number depends on the cryptographic parameters),
* the kernel cannot be launched and it is necessary to split the kernel call
* into several calls on smaller batches of inputs. For more details on this
 * operation, refer to the equivalent u64 operation.
*/
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
checks_bootstrap_low_latency(32, glwe_dimension, level_count, base_log,
polynomial_size, num_samples);
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<256>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<512>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 1024:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<1024>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 2048:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<2048>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 4096:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<4096>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 8192:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint32_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<8192>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
uint32_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint32_t, AmortizedDegree<16384>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint32_t, Degree<16384>>(
v_stream, gpu_index, (uint32_t *)lwe_array_out,
(uint32_t *)lut_vector, (uint32_t *)lut_vector_indexes,
(uint32_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/* Perform bootstrapping on a batch of input u64 LWE ciphertexts.
* This function performs best for small numbers of inputs. Beyond a certain
* number of inputs (the exact number depends on the cryptographic parameters),
* the kernel cannot be launched and it is necessary to split the kernel call
* into several calls on smaller batches of inputs.
*
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples bootstrapped ciphertexts c =
* (a0,..an-1,b) where n is the LWE dimension
* - lut_vector: should hold as many test vectors of size polynomial_size
* as there are input ciphertexts, but actually holds
* num_lut_vectors vectors to reduce memory usage
* - lut_vector_indexes: stores the index corresponding to
* which test vector to use for each sample in
* lut_vector
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing n
* mask values + 1 body value
* - bootstrapping_key: GGSW encryption of the LWE secret key sk1
* under secret key sk2
* bsk = Z + sk1 H
 *                       where H is the gadget matrix and Z is a matrix with
 *                       (k+1)*l rows containing GLWE encryptions of 0 under sk2.
 *                       bsk is thus a tensor of size (k+1)^2 * l * N * n,
* where l is the number of decomposition levels and
* k is the GLWE dimension, N is the polynomial size for
* GLWE. The polynomial size for GLWE and the test vector
* are the same because they have to be in the same ring
* to be multiplied.
* - lwe_dimension: size of the Torus vector used to encrypt the input
* LWE ciphertexts - referred to as n above (~ 600)
* - glwe_dimension: size of the polynomial vector used to encrypt the LUT
* GLWE ciphertexts - referred to as k above. Only the value 1 is supported for
* this parameter.
* - polynomial_size: size of the test polynomial (test vector) and size of the
* GLWE polynomial (~1024)
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - num_samples: number of encrypted input messages
* - num_lut_vectors: parameter to set the actual number of test vectors to be
* used
 * - lwe_idx: the index of the LWE input to consider for the GPU of index
 * gpu_index. In multi-GPU computing, it is assumed that only a part of the
 * input LWE array is copied to each GPU, while the whole LUT array is copied
 * (handling the case where the number of LUTs is smaller than the number of
 * input LWEs is not trivial when distributing the data across the GPUs).
 * `lwe_idx` is used to determine which LUT of `lut_vector` to use for a given
 * LWE input.
 * - `max_shared_memory`: maximum amount of shared memory to be used inside
 * device functions
*
* This function calls a wrapper to a device kernel that performs the
* bootstrapping:
* - the kernel is templatized based on integer discretization and
* polynomial degree
* - num_samples * level_count * (glwe_dimension + 1) blocks of threads are
* launched, where each thread is going to handle one or more polynomial
* coefficients at each stage, for a given level of decomposition, either for
* the LUT mask or its body:
* - perform the blind rotation
* - round the result
* - get the decomposition for the current level
* - switch to the FFT domain
* - multiply with the bootstrapping key
* - come back to the coefficients representation
* - between each stage a synchronization of the threads is necessary (some
* synchronizations happen at the block level, some happen between blocks, using
* cooperative groups).
* - in case the device has enough shared memory, temporary arrays used for
* the different stages (accumulators) are stored into the shared memory
* - the accumulators serve to combine the results for all decomposition
* levels
 *   - the constant memory (64K) is used for storing the roots of unity
 *     values for the FFT
*/
void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
uint32_t max_shared_memory) {
checks_bootstrap_low_latency(64, glwe_dimension, level_count, base_log,
polynomial_size, num_samples);
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 512:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 1024:
  if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                       AmortizedDegree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 2048:
  if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                       AmortizedDegree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 4096:
  if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                       AmortizedDegree<4096>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 8192:
  if (verify_cuda_bootstrap_fast_low_latency_grid_size<uint64_t,
                                                       AmortizedDegree<8192>>(
glwe_dimension, level_count, num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
case 16384:
if (verify_cuda_bootstrap_fast_low_latency_grid_size<
uint64_t, AmortizedDegree<16384>>(glwe_dimension, level_count,
num_samples, max_shared_memory))
host_bootstrap_fast_low_latency<uint64_t, AmortizedDegree<16384>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
else
host_bootstrap_low_latency<uint64_t, Degree<16384>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (double2 *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
num_samples, num_lut_vectors, max_shared_memory);
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
/*
* This cleanup function frees the data for the low latency PBS on GPU in
* pbs_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_bootstrap_low_latency(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*pbs_buffer, stream, gpu_index);
}
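/*
 * Illustrative usage sketch: the low latency PBS entry points above are driven
 * as scratch -> bootstrap -> cleanup on a single CUDA stream. The device
 * arrays (names prefixed with d_, including the bootstrapping key already
 * converted to the Fourier domain) and the parameter values are placeholders,
 * assumed to be allocated and filled by the caller.
 *
 *   cudaStream_t stream;
 *   cudaStreamCreate(&stream);
 *   uint32_t gpu_index = 0;
 *   int8_t *pbs_buffer = nullptr;
 *   // Allocate the temporary buffer and configure the kernel attributes once.
 *   scratch_cuda_bootstrap_low_latency_64(
 *       &stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
 *       level_count, num_samples, max_shared_memory, true);
 *   // Bootstrap the whole batch of num_samples input LWE ciphertexts.
 *   cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
 *       &stream, gpu_index, d_lwe_array_out, d_lut_vector,
 *       d_lut_vector_indexes, d_lwe_array_in, d_fourier_bsk, pbs_buffer,
 *       lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count,
 *       num_samples, num_lut_vectors, 0, max_shared_memory);
 *   // Release the temporary buffer (asynchronously, on the same stream).
 *   cleanup_cuda_bootstrap_low_latency(&stream, gpu_index, &pbs_buffer);
 *   cudaStreamSynchronize(stream);
 */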

View File

@@ -1,490 +0,0 @@
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#ifndef LOWLAT_PBS_H
#define LOWLAT_PBS_H
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/timer.cuh"
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_low_latency_step_one(
Torus *lwe_array_out, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, double2 *bootstrapping_key, Torus *global_accumulator,
double2 *global_accumulator_fft, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
Torus *accumulator = (Torus *)selected_memory;
double2 *accumulator_fft =
(double2 *)accumulator +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double2 *)sharedmem;
  // The third dimension of the grid determines which ciphertext this block is
  // operating on, in the case of batch bootstraps
Torus *block_lwe_array_in = &lwe_array_in[blockIdx.z * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
double2 *global_fft_slice =
global_accumulator_fft +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
blockIdx.z * level_count * (glwe_dimension + 1)) *
(polynomial_size / 2);
if (lwe_iteration == 0) {
// First iteration
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
// The y-dimension is used to select the element of the GLWE this block will
// compute
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
// Persist
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
global_slice[tid] = accumulator[tid];
tid += params::degree / params::opt;
}
}
// Put "a" in [0, 2N[
Torus a_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_iteration], a_hat,
2 * params::degree); // 2 * params::log2_degree + 1);
synchronize_threads_in_block();
  // Perform ACC * (X^a_hat - 1)
multiply_by_monomial_negacyclic_and_sub_polynomial<
Torus, params::opt, params::degree / params::opt>(global_slice,
accumulator, a_hat);
// Perform a rounding to increase the accuracy of the
// bootstrapped ciphertext
round_to_closest_multiple_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, base_log, level_count);
synchronize_threads_in_block();
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_level(accumulator_fft, blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
global_fft_slice[tid] = accumulator_fft[tid];
tid += params::degree / params::opt;
}
}
template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_bootstrap_low_latency_step_two(
Torus *lwe_array_out, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, double2 *bootstrapping_key, Torus *global_accumulator,
double2 *global_accumulator_fft, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
// much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
uint32_t glwe_dimension = gridDim.y - 1;
if constexpr (SMD == FULLSM) {
selected_memory = sharedmem;
} else {
int block_index = blockIdx.x + blockIdx.y * gridDim.x +
blockIdx.z * gridDim.x * gridDim.y;
selected_memory = &device_mem[block_index * device_memory_size_per_block];
}
  // We compute the pointer with the most restrictive alignment (double2)
  // first, to avoid alignment issues
double2 *accumulator_fft = (double2 *)selected_memory;
Torus *accumulator =
(Torus *)accumulator_fft +
(ptrdiff_t)(sizeof(double2) * params::degree / 2 / sizeof(Torus));
if constexpr (SMD == PARTIALSM)
accumulator_fft = (double2 *)sharedmem;
for (int level = 0; level < level_count; level++) {
double2 *global_fft_slice = global_accumulator_fft +
(level + blockIdx.x * level_count) *
(glwe_dimension + 1) * (params::degree / 2);
for (int j = 0; j < (glwe_dimension + 1); j++) {
double2 *fft = global_fft_slice + j * params::degree / 2;
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice =
get_ith_mask_kth_block(bootstrapping_key, lwe_iteration, j, level,
polynomial_size, glwe_dimension, level_count);
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
polynomial_product_accumulate_in_fourier_domain<params, double2>(
accumulator_fft, fft, bsk_poly, !level && !j);
}
}
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
// Load the persisted accumulator
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
accumulator[tid] = global_slice[tid];
tid += params::degree / params::opt;
}
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
add_to_torus<Torus, params>(accumulator_fft, accumulator);
if (lwe_iteration + 1 == lwe_dimension) {
// Last iteration
auto block_lwe_array_out =
&lwe_array_out[blockIdx.x * (glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, accumulator);
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, accumulator, 0);
}
} else {
// Persist the updated accumulator
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
global_slice[tid] = accumulator[tid];
tid += params::degree / params::opt;
}
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_one(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator_rotated
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_bootstrap_low_latency_step_two(
uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size + // accumulator
sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_partial_sm_bootstrap_low_latency(uint32_t polynomial_size) {
return sizeof(double2) * polynomial_size / 2; // accumulator fft
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_bootstrap_low_latency(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
uint64_t full_sm_step_one =
get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
uint64_t full_dm = full_sm_step_one;
uint64_t device_mem = 0;
if (max_shared_memory < partial_sm) {
device_mem = full_dm * input_lwe_ciphertext_count * level_count *
(glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_two) {
device_mem = (partial_dm_step_two + partial_dm_step_one * level_count) *
input_lwe_ciphertext_count * (glwe_dimension + 1);
} else if (max_shared_memory < full_sm_step_one) {
device_mem = partial_dm_step_one * input_lwe_ciphertext_count *
level_count * (glwe_dimension + 1);
}
  // Otherwise, both kernels run entirely in shared memory
uint64_t buffer_size = device_mem +
// global_accumulator_fft
(glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count *
(polynomial_size / 2) * sizeof(double2) +
// global_accumulator
(glwe_dimension + 1) * input_lwe_ciphertext_count *
polynomial_size * sizeof(Torus);
return buffer_size + buffer_size % sizeof(double2);
}
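/*
 * Worked example (illustrative): with Torus = uint64_t, glwe_dimension = 1,
 * polynomial_size = 1024, level_count = 1, a single input ciphertext and
 * enough shared memory for both kernels to run fully in shared memory
 * (device_mem = 0):
 *   global_accumulator_fft: (1+1) * 1 * 1 * (1024/2) * sizeof(double2) = 16 KB
 *   global_accumulator:     (1+1) * 1 * 1024 * sizeof(uint64_t)        = 16 KB
 * i.e. a 32 KB buffer, already a multiple of sizeof(double2), so the final
 * rounding term adds no padding.
 */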
template <typename Torus, typename STorus, typename params>
__host__ void scratch_bootstrap_low_latency(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t full_sm_step_one =
get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
// Configure step one
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency_step_one<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_bootstrap_low_latency_step_one<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency_step_one<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one));
cudaFuncSetCacheConfig(
device_bootstrap_low_latency_step_one<Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
// Configure step two
if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency_step_two<Torus, params, PARTIALSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
cudaFuncSetCacheConfig(
device_bootstrap_low_latency_step_two<Torus, params, PARTIALSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
} else if (max_shared_memory >= partial_sm) {
check_cuda_error(cudaFuncSetAttribute(
device_bootstrap_low_latency_step_two<Torus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two));
cudaFuncSetCacheConfig(
device_bootstrap_low_latency_step_two<Torus, params, FULLSM>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
}
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_bootstrap_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, max_shared_memory);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
}
template <typename Torus, class params>
__host__ void execute_low_latency_step_one(
void *v_stream, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, uint32_t max_shared_memory,
int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
uint64_t full_sm, uint64_t full_dm) {
int thds = polynomial_size / params::opt;
dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
auto stream = static_cast<cudaStream_t *>(v_stream);
if (max_shared_memory < partial_sm) {
device_bootstrap_low_latency_step_one<Torus, params, NOSM>
<<<grid, thds, 0, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
device_bootstrap_low_latency_step_one<Torus, params, PARTIALSM>
<<<grid, thds, partial_sm, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm);
} else {
device_bootstrap_low_latency_step_one<Torus, params, FULLSM>
<<<grid, thds, full_sm, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0);
}
check_cuda_error(cudaGetLastError());
}
template <typename Torus, class params>
__host__ void execute_low_latency_step_two(
void *v_stream, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *d_mem, uint32_t max_shared_memory,
int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
uint64_t full_sm, uint64_t full_dm) {
int thds = polynomial_size / params::opt;
dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
auto stream = static_cast<cudaStream_t *>(v_stream);
if (max_shared_memory < partial_sm) {
device_bootstrap_low_latency_step_two<Torus, params, NOSM>
<<<grid, thds, 0, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, full_dm);
} else if (max_shared_memory < full_sm) {
device_bootstrap_low_latency_step_two<Torus, params, PARTIALSM>
<<<grid, thds, partial_sm, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, partial_dm);
} else {
device_bootstrap_low_latency_step_two<Torus, params, FULLSM>
<<<grid, thds, full_sm, *stream>>>(
lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
lwe_iteration, lwe_dimension, polynomial_size, base_log,
level_count, d_mem, 0);
}
check_cuda_error(cudaGetLastError());
}
/*
* Host wrapper to the low latency version
* of bootstrapping
*/
template <typename Torus, class params>
__host__ void host_bootstrap_low_latency(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, double2 *bootstrapping_key,
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t num_lut_vectors,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
  // With shared memory, each block corresponds to either the mask or the body,
  // so there is no need to duplicate data for each
uint64_t full_sm_step_one =
get_buffer_size_full_sm_bootstrap_low_latency_step_one<Torus>(
polynomial_size);
uint64_t full_sm_step_two =
get_buffer_size_full_sm_bootstrap_low_latency_step_two<Torus>(
polynomial_size);
uint64_t partial_sm =
get_buffer_size_partial_sm_bootstrap_low_latency<Torus>(polynomial_size);
uint64_t partial_dm_step_one = full_sm_step_one - partial_sm;
uint64_t partial_dm_step_two = full_sm_step_two - partial_sm;
uint64_t full_dm_step_one = full_sm_step_one;
uint64_t full_dm_step_two = full_sm_step_two;
double2 *global_accumulator_fft = (double2 *)pbs_buffer;
Torus *global_accumulator =
(Torus *)global_accumulator_fft +
(ptrdiff_t)(sizeof(double2) * (glwe_dimension + 1) * level_count *
input_lwe_ciphertext_count * (polynomial_size / 2) /
sizeof(Torus));
int8_t *d_mem = (int8_t *)global_accumulator +
(ptrdiff_t)(sizeof(Torus) * (glwe_dimension + 1) *
input_lwe_ciphertext_count * polynomial_size /
sizeof(int8_t));
for (int i = 0; i < lwe_dimension; i++) {
execute_low_latency_step_one<Torus, params>(
v_stream, lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
input_lwe_ciphertext_count, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, d_mem, max_shared_memory, i,
partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
execute_low_latency_step_two<Torus, params>(
v_stream, lwe_array_out, lut_vector, lut_vector_indexes, lwe_array_in,
bootstrapping_key, global_accumulator, global_accumulator_fft,
input_lwe_ciphertext_count, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, d_mem, max_shared_memory, i,
partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two);
}
}
#endif // LOWLAT_PBS_H

View File

@@ -1,399 +0,0 @@
#include <err.h>
#include "bootstrap_fast_multibit.cuh"
#include "bootstrap_multibit.cuh"
#include "bootstrap_multibit.h"
#include "polynomial/parameters.cuh"
void checks_multi_bit_pbs(int polynomial_size) {
assert(
("Error (GPU multi-bit PBS): polynomial size should be one of 256, 512, "
"1024, 2048, 4096, 8192, 16384",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192 ||
polynomial_size == 16384));
}
void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lut_vector,
void *lut_vector_indexes, void *lwe_array_in, void *bootstrapping_key,
int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
checks_multi_bit_pbs(polynomial_size);
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 512:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 1024:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 2048:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 4096:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 8192:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
case 16384:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<16384>>(
glwe_dimension, level_count, num_samples, max_shared_memory)) {
host_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
} else {
host_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lut_vector, (uint64_t *)lut_vector_indexes,
(uint64_t *)lwe_array_in, (uint64_t *)bootstrapping_key, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, grouping_factor,
base_log, level_count, num_samples, num_lut_vectors, lwe_idx,
max_shared_memory, lwe_chunk_size);
}
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
void scratch_cuda_multi_bit_pbs_64(
void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t grouping_factor,
uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
switch (polynomial_size) {
case 256:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<256>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<256>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
break;
case 512:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<512>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<512>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
break;
case 1024:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<1024>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<1024>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
break;
case 2048:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<2048>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
break;
case 4096:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<4096>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<4096>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
break;
case 8192:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<8192>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<8192>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
break;
case 16384:
if (verify_cuda_bootstrap_fast_multi_bit_grid_size<uint64_t,
AmortizedDegree<16384>>(
glwe_dimension, level_count, input_lwe_ciphertext_count,
max_shared_memory)) {
scratch_fast_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
} else {
scratch_multi_bit_pbs<uint64_t, int64_t, AmortizedDegree<16384>>(
v_stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, level_count, input_lwe_ciphertext_count,
grouping_factor, max_shared_memory, allocate_gpu_memory,
lwe_chunk_size);
}
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
}
void cleanup_cuda_multi_bit_pbs(void *v_stream, uint32_t gpu_index,
int8_t **pbs_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*pbs_buffer, stream, gpu_index);
}
__host__ uint32_t get_lwe_chunk_size(uint32_t lwe_dimension,
uint32_t level_count,
uint32_t glwe_dimension,
uint32_t num_samples) {
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0); // Assuming device 0
const char *v100Name = "V100"; // Known name of V100 GPU
  const char *a100Name = "A100"; // Known name of A100 GPU
if (std::strstr(deviceProp.name, v100Name) != nullptr) {
// Tesla V100
if (num_samples < 16)
return 80 / num_samples;
else if (num_samples == 16)
return 40;
else if (num_samples < 1024)
return 20;
else if (num_samples < 8192)
return 10;
} else if (std::strstr(deviceProp.name, a100Name) != nullptr) {
// Tesla A100
if (num_samples < 4)
return 11;
else if (num_samples < 8)
return 6;
else if (num_samples < 16)
return 13;
else if (num_samples < 64)
return 19;
else if (num_samples < 128)
return 1;
else if (num_samples < 512)
return 19;
else if (num_samples < 1024)
return 17;
else if (num_samples < 8192)
return 19;
else if (num_samples < 16384)
return 12;
else
return 9;
}
// Generic case
return 1;
}
// Returns the maximum buffer size required to execute batches up to
// max_input_lwe_ciphertext_count
__host__ uint64_t get_max_buffer_size_multibit_bootstrap(
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t max_input_lwe_ciphertext_count) {
uint64_t max_buffer_size = 0;
for (uint32_t input_lwe_ciphertext_count = 1;
input_lwe_ciphertext_count <= max_input_lwe_ciphertext_count;
input_lwe_ciphertext_count++) {
max_buffer_size = std::max(
max_buffer_size,
get_buffer_size_multibit_bootstrap<uint64_t>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count,
get_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
input_lwe_ciphertext_count)));
}
return max_buffer_size;
}
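/*
 * Illustrative usage sketch: the multi-bit PBS follows the same
 * scratch -> bootstrap -> cleanup pattern as the low latency PBS, with a
 * grouping_factor and an lwe_chunk_size tuned per device. The device arrays
 * (names prefixed with d_) and the parameter values are placeholders, assumed
 * to be allocated and filled by the caller.
 *
 *   uint32_t chunk_size = get_lwe_chunk_size(lwe_dimension, level_count,
 *                                            glwe_dimension, num_samples);
 *   int8_t *pbs_buffer = nullptr;
 *   scratch_cuda_multi_bit_pbs_64(&stream, gpu_index, &pbs_buffer,
 *                                 lwe_dimension, glwe_dimension,
 *                                 polynomial_size, level_count,
 *                                 grouping_factor, num_samples,
 *                                 max_shared_memory, true, chunk_size);
 *   cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
 *       &stream, gpu_index, d_lwe_array_out, d_lut_vector,
 *       d_lut_vector_indexes, d_lwe_array_in, d_multi_bit_bsk, pbs_buffer,
 *       lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
 *       base_log, level_count, num_samples, num_lut_vectors, 0,
 *       max_shared_memory, chunk_size);
 *   cleanup_cuda_multi_bit_pbs(&stream, gpu_index, &pbs_buffer);
 */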

View File

@@ -1,467 +0,0 @@
#ifndef MULTIBIT_PBS_H
#define MULTIBIT_PBS_H
#include "bootstrap.h"
#include "bootstrap_fast_low_latency.cuh"
#include "bootstrap_multibit.h"
#include "complex/operations.cuh"
#include "cooperative_groups.h"
#include "crypto/gadget.cuh"
#include "crypto/ggsw.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/timer.cuh"
#include <vector>
template <typename Torus, class params>
__device__ Torus calculates_monomial_degree(Torus *lwe_array_group,
uint32_t ggsw_idx,
uint32_t grouping_factor) {
Torus x = 0;
for (int i = 0; i < grouping_factor; i++) {
uint32_t mask_position = grouping_factor - (i + 1);
int selection_bit = (ggsw_idx >> mask_position) & 1;
x += selection_bit * lwe_array_group[i];
}
return rescale_torus_element(
x, 2 * params::degree); // 2 * params::log2_degree + 1);
}
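/*
 * Example (illustrative): with grouping_factor = 3 and ggsw_idx = 0b101, the
 * selection bits pick lwe_array_group[0] and lwe_array_group[2], so
 * x = lwe_array_group[0] + lwe_array_group[2] before rescaling to [0, 2N[.
 */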
template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_keybundle(
Torus *lwe_array_in, double2 *keybundle_array, Torus *bootstrapping_key,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_offset, uint32_t lwe_chunk_size,
uint32_t keybundle_size_per_input) {
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory = sharedmem;
// Ids
uint32_t level_id = blockIdx.z;
uint32_t glwe_id = blockIdx.y / (glwe_dimension + 1);
uint32_t poly_id = blockIdx.y % (glwe_dimension + 1);
uint32_t lwe_iteration = (blockIdx.x % lwe_chunk_size + lwe_offset);
uint32_t input_idx = blockIdx.x / lwe_chunk_size;
if (lwe_iteration < (lwe_dimension / grouping_factor)) {
//
Torus *accumulator = (Torus *)selected_memory;
Torus *block_lwe_array_in = &lwe_array_in[input_idx * (lwe_dimension + 1)];
double2 *keybundle = keybundle_array +
// select the input
input_idx * keybundle_size_per_input;
////////////////////////////////////////////////////////////
// Computes all keybundles
uint32_t rev_lwe_iteration =
((lwe_dimension / grouping_factor) - lwe_iteration - 1);
// ////////////////////////////////
// Keygen guarantees the first term is a constant term of the polynomial, no
// polynomial multiplication required
Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
bootstrapping_key, 0, rev_lwe_iteration, glwe_id, level_id,
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
Torus *bsk_poly = bsk_slice + poly_id * params::degree;
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
bsk_poly, accumulator);
// Accumulate the other terms
for (int g = 1; g < (1 << grouping_factor); g++) {
Torus *bsk_slice = get_multi_bit_ith_lwe_gth_group_kth_block(
bootstrapping_key, g, rev_lwe_iteration, glwe_id, level_id,
grouping_factor, 2 * polynomial_size, glwe_dimension, level_count);
Torus *bsk_poly = bsk_slice + poly_id * params::degree;
// Calculates the monomial degree
Torus *lwe_array_group =
block_lwe_array_in + rev_lwe_iteration * grouping_factor;
uint32_t monomial_degree = calculates_monomial_degree<Torus, params>(
lwe_array_group, g, grouping_factor);
synchronize_threads_in_block();
// Multiply by the bsk element
polynomial_product_accumulate_by_monomial<Torus, params>(
accumulator, bsk_poly, monomial_degree, false);
}
synchronize_threads_in_block();
double2 *fft = (double2 *)sharedmem;
// Move accumulator to local memory
double2 temp[params::opt / 2];
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
temp[i].x = __ll2double_rn((int64_t)accumulator[tid]);
temp[i].y =
__ll2double_rn((int64_t)accumulator[tid + params::degree / 2]);
temp[i].x /= (double)std::numeric_limits<Torus>::max();
temp[i].y /= (double)std::numeric_limits<Torus>::max();
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// Move from local memory back to shared memory but as complex
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = temp[i];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
NSMFFT_direct<HalfDegree<params>>(fft);
// lwe iteration
auto keybundle_out = get_ith_mask_kth_block(
keybundle, blockIdx.x % lwe_chunk_size, glwe_id, level_id,
polynomial_size, glwe_dimension, level_count);
auto keybundle_poly = keybundle_out + poly_id * params::degree / 2;
copy_polynomial<double2, params::opt / 2, params::degree / params::opt>(
fft, keybundle_poly);
}
}
template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_accumulate_step_one(
Torus *lwe_array_in, Torus *lut_vector, Torus *lut_vector_indexes,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t lwe_iteration) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is on-chip (it shares hardware with the L1
// cache) and accessing it is much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
selected_memory = sharedmem;
Torus *accumulator = (Torus *)selected_memory;
double2 *accumulator_fft =
(double2 *)accumulator +
(ptrdiff_t)(sizeof(Torus) * polynomial_size / sizeof(double2));
Torus *block_lwe_array_in = &lwe_array_in[blockIdx.z * (lwe_dimension + 1)];
Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] *
params::degree * (glwe_dimension + 1)];
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.z * (glwe_dimension + 1)) * params::degree;
double2 *global_fft_slice =
global_accumulator_fft +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1) +
blockIdx.z * level_count * (glwe_dimension + 1)) *
(polynomial_size / 2);
if (lwe_iteration == 0) {
// First iteration
////////////////////////////////////////////////////////////
// Initializes the accumulator with the body of LWE
// Put "b" in [0, 2N[
Torus b_hat = 0;
rescale_torus_element(block_lwe_array_in[lwe_dimension], b_hat,
2 * params::degree);
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator, &block_lut_vector[blockIdx.y * params::degree], b_hat,
false);
// Persist
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
accumulator, global_slice);
} else {
// Load the accumulator calculated in previous iterations
copy_polynomial<Torus, params::opt, params::degree / params::opt>(
global_slice, accumulator);
}
// Decompose the accumulator. Each block gets one level of the
// decomposition, for the mask and the body (so block 0 will have the
// accumulator decomposed at level 0, 1 at 1, etc.)
GadgetMatrix<Torus, params> gadget_acc(base_log, level_count, accumulator);
gadget_acc.decompose_and_compress_next_polynomial(accumulator_fft,
blockIdx.x);
// We are using the same memory space for accumulator_fft and
// accumulator_rotated, so we need to synchronize here to make sure they
// don't modify the same memory space at the same time
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(accumulator_fft);
copy_polynomial<double2, params::opt / 2, params::degree / params::opt>(
accumulator_fft, global_fft_slice);
}
template <typename Torus, class params>
__global__ void device_multi_bit_bootstrap_accumulate_step_two(
Torus *lwe_array_out, double2 *keybundle_array, Torus *global_accumulator,
double2 *global_accumulator_fft, uint32_t lwe_dimension,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
uint32_t lwe_chunk_size) {
// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is on-chip (it shares hardware with the L1
// cache) and accessing it is much faster than global memory
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
selected_memory = sharedmem;
double2 *accumulator_fft = (double2 *)selected_memory;
double2 *keybundle = keybundle_array +
// select the input
blockIdx.x * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2);
double2 *global_accumulator_fft_input =
global_accumulator_fft +
blockIdx.x * level_count * (glwe_dimension + 1) * (polynomial_size / 2);
for (int level = 0; level < level_count; level++) {
double2 *global_fft_slice =
global_accumulator_fft_input +
level * (glwe_dimension + 1) * (polynomial_size / 2);
for (int j = 0; j < (glwe_dimension + 1); j++) {
double2 *fft = global_fft_slice + j * params::degree / 2;
// Get the bootstrapping key piece necessary for the multiplication
// It is already in the Fourier domain
auto bsk_slice =
get_ith_mask_kth_block(keybundle, iteration, j, level,
polynomial_size, glwe_dimension, level_count);
auto bsk_poly = bsk_slice + blockIdx.y * params::degree / 2;
polynomial_product_accumulate_in_fourier_domain<params, double2>(
accumulator_fft, fft, bsk_poly, !level && !j);
}
}
// Perform the inverse FFT on the result of the GGSW x GLWE and add to the
// accumulator
NSMFFT_inverse<HalfDegree<params>>(accumulator_fft);
Torus *global_slice =
global_accumulator +
(blockIdx.y + blockIdx.x * (glwe_dimension + 1)) * params::degree;
add_to_torus<Torus, params>(accumulator_fft, global_slice, true);
synchronize_threads_in_block();
uint32_t lwe_iteration = iteration + lwe_offset;
if (lwe_iteration + 1 == (lwe_dimension / grouping_factor)) {
// Last iteration
auto block_lwe_array_out =
&lwe_array_out[blockIdx.x * (glwe_dimension * polynomial_size + 1) +
blockIdx.y * polynomial_size];
if (blockIdx.y < glwe_dimension) {
// Perform a sample extract. At this point, all blocks have the result,
// but we do the computation at block 0 to avoid waiting for extra blocks,
// in case they're not synchronized
sample_extract_mask<Torus, params>(block_lwe_array_out, global_slice);
} else if (blockIdx.y == glwe_dimension) {
sample_extract_body<Torus, params>(block_lwe_array_out, global_slice, 0);
}
}
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_bootstrap_keybundle(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_bootstrap_step_one(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * 2; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t
get_buffer_size_full_sm_multibit_bootstrap_step_two(uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size; // accumulator
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_multibit_bootstrap(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size) {
uint64_t buffer_size = 0;
buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
(glwe_dimension + 1) * (glwe_dimension + 1) *
(polynomial_size / 2) * sizeof(double2); // keybundle fft
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) *
sizeof(double2); // global_accumulator_fft
buffer_size += input_lwe_ciphertext_count * (glwe_dimension + 1) *
polynomial_size * sizeof(Torus); // global_accumulator
return buffer_size + buffer_size % sizeof(double2);
}
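// Worked example (illustrative parameters, not taken from this file): with
// Torus = uint64_t, glwe_dimension = 1, polynomial_size = 1024,
// level_count = 1, input_lwe_ciphertext_count = 1 and lwe_chunk_size = 1,
// the formula above gives
//   keybundle fft         : 1 * 1 * 1 * 2 * 2 * 512 * 16 B = 32 KiB
//   global_accumulator_fft: 1 * 2 * 1 * 512 * 16 B         = 16 KiB
//   global_accumulator    : 1 * 2 * 1024 * 8 B             = 16 KiB
// for a total of 64 KiB; the trailing `buffer_size % sizeof(double2)` term
// only adds slack when the sum is not already a multiple of 16 bytes.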
template <typename Torus, typename STorus, typename params>
__host__ void
scratch_multi_bit_pbs(void *v_stream, uint32_t gpu_index, int8_t **pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count,
uint32_t grouping_factor, uint32_t max_shared_memory,
bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_bootstrap_step_one<Torus>(
polynomial_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_bootstrap_step_two<Torus>(
polynomial_size);
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_bootstrap_keybundle<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle));
cudaFuncSetCacheConfig(device_multi_bit_bootstrap_keybundle<Torus, params>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_bootstrap_accumulate_step_one<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_accumulate_step_one));
cudaFuncSetCacheConfig(
device_multi_bit_bootstrap_accumulate_step_one<Torus, params>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
check_cuda_error(cudaFuncSetAttribute(
device_multi_bit_bootstrap_accumulate_step_two<Torus, params>,
cudaFuncAttributeMaxDynamicSharedMemorySize,
full_sm_accumulate_step_two));
cudaFuncSetCacheConfig(
device_multi_bit_bootstrap_accumulate_step_two<Torus, params>,
cudaFuncCachePreferShared);
check_cuda_error(cudaGetLastError());
if (allocate_gpu_memory) {
if (!lwe_chunk_size)
lwe_chunk_size =
get_lwe_chunk_size(lwe_dimension, level_count, glwe_dimension,
input_lwe_ciphertext_count);
uint64_t buffer_size = get_buffer_size_multibit_bootstrap<Torus>(
glwe_dimension, polynomial_size, level_count,
input_lwe_ciphertext_count, lwe_chunk_size);
*pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
}
template <typename Torus, typename STorus, class params>
__host__ void host_multi_bit_pbs(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, uint64_t *bootstrapping_key,
int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
uint32_t level_count, uint32_t num_samples, uint32_t num_lut_vectors,
uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
// If a chunk size is not passed to this function, select one.
if (!lwe_chunk_size)
lwe_chunk_size = get_lwe_chunk_size(lwe_dimension, level_count,
glwe_dimension, num_samples);
//
double2 *keybundle_fft = (double2 *)pbs_buffer;
double2 *global_accumulator_fft =
(double2 *)keybundle_fft +
num_samples * lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
Torus *global_accumulator =
(Torus *)global_accumulator_fft +
(ptrdiff_t)(sizeof(double2) * num_samples * (glwe_dimension + 1) *
level_count * (polynomial_size / 2) / sizeof(Torus));
//
uint64_t full_sm_keybundle =
get_buffer_size_full_sm_multibit_bootstrap_keybundle<Torus>(
polynomial_size);
uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_bootstrap_step_one<Torus>(
polynomial_size);
uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_bootstrap_step_two<Torus>(
polynomial_size);
uint32_t keybundle_size_per_input =
lwe_chunk_size * level_count * (glwe_dimension + 1) *
(glwe_dimension + 1) * (polynomial_size / 2);
//
dim3 grid_accumulate_step_one(level_count, glwe_dimension + 1, num_samples);
dim3 grid_accumulate_step_two(num_samples, glwe_dimension + 1);
dim3 thds(polynomial_size / params::opt, 1, 1);
for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
lwe_offset += lwe_chunk_size) {
uint32_t chunk_size = std::min(
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
// Compute a keybundle
dim3 grid_keybundle(num_samples * chunk_size,
(glwe_dimension + 1) * (glwe_dimension + 1),
level_count);
device_multi_bit_bootstrap_keybundle<Torus, params>
<<<grid_keybundle, thds, full_sm_keybundle, *stream>>>(
lwe_array_in, keybundle_fft, bootstrapping_key, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, base_log,
level_count, lwe_offset, chunk_size, keybundle_size_per_input);
check_cuda_error(cudaGetLastError());
// Accumulate
for (int j = 0; j < chunk_size; j++) {
device_multi_bit_bootstrap_accumulate_step_one<Torus, params>
<<<grid_accumulate_step_one, thds, full_sm_accumulate_step_one,
*stream>>>(lwe_array_in, lut_vector, lut_vector_indexes,
global_accumulator, global_accumulator_fft,
lwe_dimension, glwe_dimension, polynomial_size,
base_log, level_count, j + lwe_offset);
check_cuda_error(cudaGetLastError());
device_multi_bit_bootstrap_accumulate_step_two<Torus, params>
<<<grid_accumulate_step_two, thds, full_sm_accumulate_step_two,
*stream>>>(lwe_array_out, keybundle_fft, global_accumulator,
global_accumulator_fft, lwe_dimension, glwe_dimension,
polynomial_size, level_count, grouping_factor, j,
lwe_offset, lwe_chunk_size);
check_cuda_error(cudaGetLastError());
}
}
}
#endif // MULTIBIT_PBS_H

View File

@@ -1 +0,0 @@
#include "crypto/ciphertext.cuh"

View File

@@ -1,329 +0,0 @@
#include "circuit_bootstrap.cuh"
#include "circuit_bootstrap.h"
/*
* Runs standard checks to validate the inputs
*/
void checks_fast_circuit_bootstrap(int polynomial_size) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
}
/*
* Runs standard checks to validate the inputs
*/
void checks_circuit_bootstrap(int glwe_dimension, int polynomial_size,
int level_bsk, int number_of_inputs) {
// The number of samples should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 4 * "
"(k + 1) * level_count_bsk",
number_of_inputs <=
number_of_sm / 4. / (glwe_dimension + 1) / level_bsk));
checks_fast_circuit_bootstrap(polynomial_size);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the circuit bootstrap on 32 bits inputs, into `cbs_buffer`. It also
* configures SM options on the GPU in case FULLSM mode is going to be used.
*/
void scratch_cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_bsk, uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_circuit_bootstrap<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the circuit bootstrap on 64 bits inputs, into `cbs_buffer`. It also
* configures SM options on the GPU in case FULLSM mode is going to be used.
*/
void scratch_cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_bsk, uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_circuit_bootstrap<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* Perform circuit bootstrapping on a batch of 32-bit LWE ciphertexts.
* Refer to the equivalent 64-bit operation for more details.
*/
void cuda_circuit_bootstrap_32(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
checks_circuit_bootstrap(glwe_dimension, polynomial_size, level_bsk,
number_of_inputs);
switch (polynomial_size) {
case 256:
host_circuit_bootstrap<uint32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 512:
host_circuit_bootstrap<uint32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 1024:
host_circuit_bootstrap<uint32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 2048:
host_circuit_bootstrap<uint32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 4096:
host_circuit_bootstrap<uint32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 8192:
host_circuit_bootstrap<uint32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)ggsw_out, (uint32_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint32_t *)fp_ksk_array,
(uint32_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
default:
break;
}
}
/*
* Perform circuit bootstrapping on a batch of 64 bit input LWE ciphertexts.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - 'ggsw_out' output batch of ggsw with size:
* 'number_of_inputs' * 'level_cbs' * ('glwe_dimension' + 1)^2 *
* polynomial_size * sizeof(u64)
* - 'lwe_array_in' input batch of lwe ciphertexts, with size:
* 'number_of_inputs' * ('lwe_dimension' + 1) * sizeof(u64)
* - 'fourier_bsk' bootstrapping key in fourier domain with size:
* 'lwe_dimension' * 'level_bsk' * ('glwe_dimension' + 1)^2 *
* 'polynomial_size' / 2 * sizeof(double2)
* - 'fp_ksk_array' batch of fp-keyswitch keys with size:
* ('polynomial_size' + 1) * 'level_pksk' * ('glwe_dimension' + 1)^2 *
* 'polynomial_size' * sizeof(u64)
* - 'cbs_buffer': scratch buffer used during the computation. It is not a
* real input of the function, only pre-allocated working memory: it can be
* allocated once and reused across several calls to the circuit bootstrap
* function
*
* This function calls a wrapper to a device kernel that performs the
* circuit bootstrap. The kernel is templatized based on integer discretization
* and polynomial degree.
*/
void cuda_circuit_bootstrap_64(
void *v_stream, uint32_t gpu_index, void *ggsw_out, void *lwe_array_in,
void *fourier_bsk, void *fp_ksk_array, void *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
checks_circuit_bootstrap(glwe_dimension, polynomial_size, level_bsk,
number_of_inputs);
switch (polynomial_size) {
case 256:
host_circuit_bootstrap<uint64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 512:
host_circuit_bootstrap<uint64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 1024:
host_circuit_bootstrap<uint64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 2048:
host_circuit_bootstrap<uint64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 4096:
host_circuit_bootstrap<uint64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
case 8192:
host_circuit_bootstrap<uint64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)ggsw_out, (uint64_t *)lwe_array_in,
(double2 *)fourier_bsk, (uint64_t *)fp_ksk_array,
(uint64_t *)lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_bsk, base_log_bsk, level_pksk,
base_log_pksk, level_cbs, base_log_cbs, number_of_inputs,
max_shared_memory);
break;
default:
break;
}
}
/*
* This cleanup function frees the data for the circuit bootstrap on GPU in
* cbs_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_circuit_bootstrap(void *v_stream, uint32_t gpu_index,
int8_t **cbs_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*cbs_buffer, stream, gpu_index);
}
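/*
 * Minimal usage sketch (not part of the original file; the stream setup, the
 * device pointers d_ggsw_out, d_lwe_in, d_bsk, d_fp_ksk, d_lut_indexes and
 * the parameter values are assumptions for illustration only). The three
 * entry points above are meant to be used in a scratch / compute / cleanup
 * sequence:
 *
 *   cudaStream_t stream;
 *   cudaStreamCreate(&stream);
 *   int8_t *cbs_buffer = nullptr;
 *   scratch_cuda_circuit_bootstrap_64(&stream, 0, &cbs_buffer, glwe_dimension,
 *                                     lwe_dimension, polynomial_size,
 *                                     level_bsk, level_cbs, number_of_inputs,
 *                                     max_shared_memory, true);
 *   cuda_circuit_bootstrap_64(&stream, 0, d_ggsw_out, d_lwe_in, d_bsk,
 *                             d_fp_ksk, d_lut_indexes, cbs_buffer, delta_log,
 *                             polynomial_size, glwe_dimension, lwe_dimension,
 *                             level_bsk, base_log_bsk, level_pksk,
 *                             base_log_pksk, level_cbs, base_log_cbs,
 *                             number_of_inputs, max_shared_memory);
 *   cleanup_cuda_circuit_bootstrap(&stream, 0, &cbs_buffer);
 *   cudaStreamSynchronize(stream);
 */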

View File

@@ -1,239 +0,0 @@
#ifndef CBS_CUH
#define CBS_CUH
#include "bit_extraction.cuh"
#include "bootstrap_low_latency.cuh"
#include "device.h"
#include "keyswitch.cuh"
#include "polynomial/parameters.cuh"
#include "utils/timer.cuh"
/*
* Scalar multiplication by `value` for a batch of LWE ciphertexts.
* Works for any LWE input size.
* blockIdx.y refers to the input ciphertext id, blockIdx.x to the cbs level
*/
template <typename Torus, class params>
__global__ void shift_lwe_cbs(Torus *dst_shift, Torus *src, Torus value,
size_t lwe_size) {
size_t blockId = blockIdx.y * gridDim.x + blockIdx.x;
size_t threads_per_block = blockDim.x;
size_t opt = lwe_size / threads_per_block;
size_t rem = lwe_size & (threads_per_block - 1);
auto cur_dst = &dst_shift[blockId * lwe_size];
auto cur_src = &src[blockIdx.y * lwe_size];
size_t tid = threadIdx.x;
for (size_t i = 0; i < opt; i++) {
cur_dst[tid] = cur_src[tid] * value;
tid += threads_per_block;
}
if (threadIdx.x < rem)
cur_dst[tid] = cur_src[tid] * value;
}
/*
* Fill the lut; this is equivalent to a trivial encryption since the mask is all 0s.
* The LUT is filled with -alpha in each coefficient where
* alpha = 2^{log(q) - 1 - base_log * level}
* blockIdx.x refers to lut id
* The value is not passed as a parameter; it is computed inside the kernel
* because it depends on the lut id.
*/
template <typename Torus, class params>
__global__ void fill_lut_body_for_cbs(Torus *lut, uint32_t ciphertext_n_bits,
uint32_t base_log_cbs,
uint32_t glwe_dimension) {
Torus *cur_body = &lut[(blockIdx.x * (glwe_dimension + 1) + glwe_dimension) *
params::degree];
size_t tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
cur_body[tid] =
0ll -
(1ll << (ciphertext_n_bits - 1 - base_log_cbs * (blockIdx.x + 1)));
tid += params::degree / params::opt;
}
}
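// Worked example (illustrative values): for a 64-bit Torus
// (ciphertext_n_bits = 64), base_log_cbs = 10 and blockIdx.x = 0 (first CBS
// level), every body coefficient is set to
//   -(1 << (64 - 1 - 10 * 1)) = -2^53,
// i.e. -alpha with alpha = 2^{log2(q) - 1 - base_log * level}.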
/*
* Copy the pbs result (glwe_dimension + 1) times to build the fp-ks input.
* Each input ciphertext from lwe_src is copied (glwe_dimension + 1)
* times into lwe_dst, and then a value is added to the body.
* blockIdx.x refers to destination lwe ciphertext id: 'dst_lwe_id'
* 'src_lwe_id' = 'dst_lwe_id' / (glwe_dimension + 1)
*
* example: glwe_dimension = 1
* src_0 ... src_n
* / \ / \
* / \ / \
* dst_0 dst_1 dst_2n dst_2n+1
*/
template <typename Torus, class params>
__global__ void copy_add_lwe_cbs(Torus *lwe_dst, Torus *lwe_src,
uint32_t ciphertext_n_bits,
uint32_t base_log_cbs, uint32_t level_cbs,
uint32_t glwe_dimension) {
size_t tid = threadIdx.x;
size_t src_lwe_id = blockIdx.x / (glwe_dimension + 1);
size_t dst_lwe_id = blockIdx.x;
size_t cur_cbs_level = src_lwe_id % level_cbs + 1;
auto cur_src = &lwe_src[src_lwe_id * (glwe_dimension * params::degree + 1)];
auto cur_dst = &lwe_dst[dst_lwe_id * (glwe_dimension * params::degree + 1)];
auto cur_src_slice = cur_src + blockIdx.y * params::degree;
auto cur_dst_slice = cur_dst + blockIdx.y * params::degree;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
cur_dst_slice[tid] = cur_src_slice[tid];
tid += params::degree / params::opt;
}
Torus val = 1ll << (ciphertext_n_bits - 1 - base_log_cbs * cur_cbs_level);
if (threadIdx.x == 0 && blockIdx.y == 0) {
cur_dst[glwe_dimension * params::degree] =
cur_src[glwe_dimension * params::degree] + val;
}
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_cbs(uint32_t glwe_dimension,
uint32_t lwe_dimension,
uint32_t polynomial_size,
uint32_t level_count_cbs,
uint32_t number_of_inputs) {
uint64_t buffer_size =
number_of_inputs * level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension * polynomial_size + 1) *
sizeof(Torus) + // lwe_array_in_fp_ks_buffer
number_of_inputs * level_count_cbs *
(glwe_dimension * polynomial_size + 1) *
sizeof(Torus) + // lwe_array_out_pbs_buffer
number_of_inputs * level_count_cbs * (lwe_dimension + 1) *
sizeof(Torus) + // lwe_array_in_shifted_buffer
level_count_cbs * (glwe_dimension + 1) * polynomial_size *
sizeof(Torus); // lut_vector_cbs
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_circuit_bootstrap(
void *v_stream, uint32_t gpu_index, int8_t **cbs_buffer,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_bsk, uint32_t level_count_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
int pbs_count = number_of_inputs * level_count_cbs;
// allocate and initialize device pointers for circuit bootstrap
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_cbs<Torus>(
glwe_dimension, lwe_dimension, polynomial_size,
level_count_cbs, number_of_inputs) +
get_buffer_size_bootstrap_low_latency<Torus>(
glwe_dimension, polynomial_size, level_bsk,
pbs_count, max_shared_memory);
*cbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
}
scratch_bootstrap_low_latency<Torus, STorus, params>(
v_stream, gpu_index, cbs_buffer, glwe_dimension, polynomial_size,
level_bsk, pbs_count, max_shared_memory, false);
}
/*
* Host function for cuda circuit bootstrap.
* It executes device functions in specific order and manages
* parallelism
*/
template <typename Torus, class params>
__host__ void host_circuit_bootstrap(
void *v_stream, uint32_t gpu_index, Torus *ggsw_out, Torus *lwe_array_in,
double2 *fourier_bsk, Torus *fp_ksk_array, Torus *lut_vector_indexes,
int8_t *cbs_buffer, uint32_t delta_log, uint32_t polynomial_size,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t level_bsk,
uint32_t base_log_bsk, uint32_t level_pksk, uint32_t base_log_pksk,
uint32_t level_cbs, uint32_t base_log_cbs, uint32_t number_of_inputs,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t ciphertext_n_bits = sizeof(Torus) * 8;
uint32_t lwe_size = lwe_dimension + 1;
int pbs_count = number_of_inputs * level_cbs;
dim3 blocks(level_cbs, number_of_inputs, 1);
int threads = 256;
// Always define the PBS buffer first, because it has the strongest memory
// alignment requirement (16 bytes for double2)
int8_t *pbs_buffer = (int8_t *)cbs_buffer;
Torus *lwe_array_out_pbs_buffer =
(Torus *)pbs_buffer +
(ptrdiff_t)(get_buffer_size_bootstrap_low_latency<Torus>(
glwe_dimension, polynomial_size, level_bsk, pbs_count,
max_shared_memory) /
sizeof(Torus));
Torus *lwe_array_in_shifted_buffer =
lwe_array_out_pbs_buffer +
(ptrdiff_t)(number_of_inputs * level_cbs *
(glwe_dimension * polynomial_size + 1));
Torus *lut_vector =
lwe_array_in_shifted_buffer +
(ptrdiff_t)(number_of_inputs * level_cbs * (lwe_dimension + 1));
Torus *lwe_array_in_fp_ks_buffer =
lut_vector +
(ptrdiff_t)(level_cbs * (glwe_dimension + 1) * polynomial_size);
// Shift the message LSB onto the padding bit; at this point we expect
// messages carrying only 1 bit of information
shift_lwe_cbs<Torus, params><<<blocks, threads, 0, *stream>>>(
lwe_array_in_shifted_buffer, lwe_array_in,
1LL << (ciphertext_n_bits - delta_log - 1), lwe_size);
// Add q/4 to center the error while computing a negacyclic LUT
add_to_body<Torus>
<<<pbs_count, 1, 0, *stream>>>(lwe_array_in_shifted_buffer, lwe_dimension,
1ll << (ciphertext_n_bits - 2));
// Fill lut (equivalent to trivial encryption as mask is 0s)
// The LUT is filled with -alpha in each coefficient where
// alpha = 2^{log(q) - 1 - base_log * level}
check_cuda_error(cudaMemsetAsync(lut_vector, 0,
level_cbs * (glwe_dimension + 1) *
polynomial_size * sizeof(Torus),
*stream));
fill_lut_body_for_cbs<Torus, params>
<<<level_cbs, params::degree / params::opt, 0, *stream>>>(
lut_vector, ciphertext_n_bits, base_log_cbs, glwe_dimension);
// Applying a negacyclic LUT on a ciphertext with one bit of message in the
// MSB and no bit of padding
host_bootstrap_low_latency<Torus, params>(
v_stream, gpu_index, lwe_array_out_pbs_buffer, lut_vector,
lut_vector_indexes, lwe_array_in_shifted_buffer, fourier_bsk, pbs_buffer,
glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk, level_bsk,
pbs_count, level_cbs, max_shared_memory);
dim3 copy_grid(pbs_count * (glwe_dimension + 1), glwe_dimension, 1);
dim3 copy_block(params::degree / params::opt, 1, 1);
// Copy the pbs result (glwe_dimension + 1) times to build the fp-ks input,
// adding the appropriate alpha to each body
copy_add_lwe_cbs<Torus, params><<<copy_grid, copy_block, 0, *stream>>>(
lwe_array_in_fp_ks_buffer, lwe_array_out_pbs_buffer, ciphertext_n_bits,
base_log_cbs, level_cbs, glwe_dimension);
cuda_fp_keyswitch_lwe_to_glwe(
v_stream, gpu_index, ggsw_out, lwe_array_in_fp_ks_buffer, fp_ksk_array,
glwe_dimension * polynomial_size, glwe_dimension, polynomial_size,
base_log_pksk, level_pksk, pbs_count * (glwe_dimension + 1),
glwe_dimension + 1);
}
#endif // CBS_CUH

View File

@@ -1,138 +0,0 @@
#ifndef GPU_BOOTSTRAP_COMMON_CUH
#define GPU_BOOTSTRAP_COMMON_CUH
#include <cassert>
#include <cstdint>
#include <cstdio>
#define SNT 1
#define dPI 6.283185307179586231995926937088
using sTorus = int32_t;
// using Torus = uint32_t;
using u32 = uint32_t;
using i32 = int32_t;
//--------------------------------------------------
// Basic double2 operations
__device__ inline double2 conjugate(const double2 num) {
double2 res;
res.x = num.x;
res.y = -num.y;
return res;
}
__device__ inline void operator+=(double2 &lh, const double2 rh) {
lh.x += rh.x;
lh.y += rh.y;
}
__device__ inline void operator-=(double2 &lh, const double2 rh) {
lh.x -= rh.x;
lh.y -= rh.y;
}
__device__ inline double2 operator+(const double2 a, const double2 b) {
double2 res;
res.x = a.x + b.x;
res.y = a.y + b.y;
return res;
}
__device__ inline double2 operator-(const double2 a, const double2 b) {
double2 res;
res.x = a.x - b.x;
res.y = a.y - b.y;
return res;
}
__device__ inline double2 operator*(const double2 a, const double2 b) {
double xx = a.x * b.x;
double xy = a.x * b.y;
double yx = a.y * b.x;
double yy = a.y * b.y;
double2 res;
// asm volatile("fma.rn.f64 %0, %1, %2, %3;": "=d"(res.x) : "d"(a.x),
// "d"(b.x), "d"(yy));
res.x = xx - yy;
res.y = xy + yx;
return res;
}
__device__ inline double2 operator*(const double2 a, double b) {
double2 res;
res.x = a.x * b;
res.y = a.y * b;
return res;
}
__device__ inline void operator*=(double2 &a, const double2 b) {
double tmp = a.x;
a.x *= b.x;
a.x -= a.y * b.y;
a.y *= b.x;
a.y += b.y * tmp;
}
__device__ inline double2 operator*(double a, double2 b) {
double2 res;
res.x = b.x * a;
res.y = b.y * a;
return res;
}
template <typename T> __global__ void print_debug_kernel(T *src, int N) {
for (int i = 0; i < N; i++) {
printf("%lu, ", src[i]);
}
}
template <typename T> void print_debug(const char *name, T *src, int N) {
printf("%s: ", name);
cudaDeviceSynchronize();
print_debug_kernel<<<1, 1>>>(src, N);
cudaDeviceSynchronize();
printf("\n");
}
template <typename Torus> struct int_mul_memory {
Torus *vector_result_sb;
Torus *block_mul_res;
Torus *small_lwe_vector;
Torus *lwe_pbs_out_array;
Torus *test_vector_array;
Torus *message_acc;
Torus *carry_acc;
Torus *test_vector_indexes;
Torus *tvi_message;
Torus *tvi_carry;
int8_t *pbs_buffer;
int p2p_gpu_count = 0;
cudaStream_t *streams[32];
int8_t *pbs_buffer_multi_gpu[32];
Torus *pbs_input_multi_gpu[32];
Torus *pbs_output_multi_gpu[32];
Torus *test_vector_multi_gpu[32];
Torus *tvi_lsb_multi_gpu[32];
Torus *tvi_msb_multi_gpu[32];
Torus *tvi_message_multi_gpu[32];
Torus *tvi_carry_multi_gpu[32];
Torus *bsk_multi_gpu[32];
Torus *ksk_multi_gpu[32];
Torus *device_to_device_buffer[8];
bool IsAppBuiltAs64() { return sizeof(void *) == 8; }
};
#endif

View File

@@ -1,498 +0,0 @@
#ifndef CNCRT_BSK_H
#define CNCRT_BSK_H
#include "bootstrap.h"
#include "bootstrap_multibit.h"
#include "device.h"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include <atomic>
#include <cstdint>
#include <err.h>
__device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count) {
return i * polynomial_size / 2 * (glwe_dimension + 1) * (glwe_dimension + 1) *
level_count;
}
////////////////////////////////////////////////
template <typename T>
__device__ T *get_ith_mask_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count) {
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
level_count) +
level * polynomial_size / 2 * (glwe_dimension + 1) *
(glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1)];
}
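// Worked example (illustrative parameters): with polynomial_size = 1024
// (N / 2 = 512 complex coefficients per polynomial), glwe_dimension = 1 and
// level_count = 2, one GGSW occupies 512 * 2 * 2 * 2 = 4096 double2 values,
// so get_ith_mask_kth_block(ptr, 1, 0, 1, ...) points at offset
//   4096 + 1 * 512 * 4 + 0 * 512 * 2 = 6144.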
template <typename T>
__device__ T *get_ith_body_kth_block(T *ptr, int i, int k, int level,
uint32_t polynomial_size,
int glwe_dimension, uint32_t level_count) {
return &ptr[get_start_ith_ggsw(i, polynomial_size, glwe_dimension,
level_count) +
level * polynomial_size / 2 * (glwe_dimension + 1) *
(glwe_dimension + 1) +
k * polynomial_size / 2 * (glwe_dimension + 1) +
glwe_dimension * polynomial_size / 2];
}
////////////////////////////////////////////////
__device__ inline int get_start_ith_lwe(uint32_t i, uint32_t grouping_factor,
uint32_t polynomial_size,
uint32_t glwe_dimension,
uint32_t level_count) {
return i * (1 << grouping_factor) * polynomial_size / 2 *
(glwe_dimension + 1) * (glwe_dimension + 1) * level_count;
}
template <typename T>
__device__ T *get_multi_bit_ith_lwe_gth_group_kth_block(
T *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count) {
T *ptr_group = ptr + get_start_ith_lwe(i, grouping_factor, polynomial_size,
glwe_dimension, level_count);
return get_ith_mask_kth_block(ptr_group, g, k, level, polynomial_size,
glwe_dimension, level_count);
}
////////////////////////////////////////////////
template <typename T, typename ST>
void cuda_convert_lwe_bootstrap_key(double2 *dest, ST *src, void *v_stream,
uint32_t gpu_index, uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size,
uint32_t total_polynomials) {
cudaSetDevice(gpu_index);
int shared_memory_size = sizeof(double) * polynomial_size;
// Here the buffer size is the size of double2 times the number of polynomials
// times the polynomial size over 2 because the polynomials are compressed
// into the complex domain to perform the FFT
size_t buffer_size =
total_polynomials * polynomial_size / 2 * sizeof(double2);
int gridSize = total_polynomials;
int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);
double2 *h_bsk = (double2 *)malloc(buffer_size);
auto stream = static_cast<cudaStream_t *>(v_stream);
double2 *d_bsk = (double2 *)cuda_malloc_async(buffer_size, stream, gpu_index);
// Compress the real bsk into the complex domain and normalize by the maximum value of the Torus type
for (int i = 0; i < total_polynomials; i++) {
int complex_current_poly_idx = i * polynomial_size / 2;
int torus_current_poly_idx = i * polynomial_size;
for (int j = 0; j < polynomial_size / 2; j++) {
h_bsk[complex_current_poly_idx + j].x = src[torus_current_poly_idx + j];
h_bsk[complex_current_poly_idx + j].y =
src[torus_current_poly_idx + j + polynomial_size / 2];
h_bsk[complex_current_poly_idx + j].x /=
(double)std::numeric_limits<T>::max();
h_bsk[complex_current_poly_idx + j].y /=
(double)std::numeric_limits<T>::max();
}
}
cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index);
double2 *buffer;
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
cudaFuncCachePreferShared));
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(d_bsk, dest,
buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(d_bsk, dest, buffer);
}
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
cuda_drop_async(d_bsk, stream, gpu_index);
cuda_drop_async(buffer, stream, gpu_index);
free(h_bsk);
}
void cuda_convert_lwe_bootstrap_key_32(void *dest, void *src, void *v_stream,
uint32_t gpu_index,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_bootstrap_key<uint32_t, int32_t>(
(double2 *)dest, (int32_t *)src, v_stream, gpu_index, input_lwe_dim,
glwe_dim, level_count, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_bootstrap_key_64(void *dest, void *src, void *v_stream,
uint32_t gpu_index,
uint32_t input_lwe_dim,
uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size) {
uint32_t total_polynomials =
input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
cuda_convert_lwe_bootstrap_key<uint64_t, int64_t>(
(double2 *)dest, (int64_t *)src, v_stream, gpu_index, input_lwe_dim,
glwe_dim, level_count, polynomial_size, total_polynomials);
}
void cuda_convert_lwe_multi_bit_bootstrap_key_64(
void *dest, void *src, void *v_stream, uint32_t gpu_index,
uint32_t input_lwe_dim, uint32_t glwe_dim, uint32_t level_count,
uint32_t polynomial_size, uint32_t grouping_factor) {
uint32_t total_polynomials = input_lwe_dim * (glwe_dim + 1) * (glwe_dim + 1) *
level_count * (1 << grouping_factor) /
grouping_factor;
size_t buffer_size = total_polynomials * polynomial_size * sizeof(uint64_t);
cuda_memcpy_async_to_gpu((uint64_t *)dest, (uint64_t *)src, buffer_size,
(cudaStream_t *)v_stream, gpu_index);
}
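// Worked example (illustrative parameters): with input_lwe_dim = 768,
// glwe_dim = 1, level_count = 1 and grouping_factor = 3, the key holds
// 768 * 2 * 2 * 1 * 2^3 / 3 = 8192 polynomials; for polynomial_size = 1024
// and 64-bit coefficients the copy above transfers 8192 * 1024 * 8 B = 64 MiB.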
void cuda_fourier_polynomial_mul(void *_input1, void *_input2, void *_output,
void *v_stream, uint32_t gpu_index,
uint32_t polynomial_size,
uint32_t total_polynomials) {
auto stream = static_cast<cudaStream_t *>(v_stream);
auto input1 = (double2 *)_input1;
auto input2 = (double2 *)_input2;
auto output = (double2 *)_output;
size_t shared_memory_size = sizeof(double2) * polynomial_size / 2;
int gridSize = total_polynomials;
int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);
double2 *buffer;
switch (polynomial_size) {
case 256:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(input1, input2, output, buffer);
}
break;
case 512:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<521>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(input1, input2, output, buffer);
}
break;
case 1024:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(input1, input2, output, buffer);
}
break;
case 2048:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(input1, input2, output, buffer);
}
break;
case 4096:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(input1, input2, output, buffer);
}
break;
case 8192:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(input1, input2, output, buffer);
}
break;
case 16384:
if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
check_cuda_error(cudaFuncSetAttribute(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
check_cuda_error(cudaFuncSetCacheConfig(
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>,
cudaFuncCachePreferShared));
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(
input1, input2, output, buffer);
} else {
buffer = (double2 *)cuda_malloc_async(
shared_memory_size * total_polynomials, stream, gpu_index);
batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(input1, input2, output, buffer);
}
break;
default:
errx(EXIT_FAILURE, "polynomial size %u is not supported. Supported values "
"are: 256, 512, 1024, 2048, 4096, 8192, 16384.", polynomial_size);
break;
}
cuda_drop_async(buffer, stream, gpu_index);
}
// We need these explicit instantiations so the compiler generates code for these template functions
template __device__ uint64_t *get_ith_mask_kth_block(uint64_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint32_t *get_ith_mask_kth_block(uint32_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ double2 *get_ith_mask_kth_block(double2 *ptr, int i, int k,
int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint64_t *get_ith_body_kth_block(uint64_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint32_t *get_ith_body_kth_block(uint32_t *ptr, int i,
int k, int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ double2 *get_ith_body_kth_block(double2 *ptr, int i, int k,
int level,
uint32_t polynomial_size,
int glwe_dimension,
uint32_t level_count);
template __device__ uint64_t *get_multi_bit_ith_lwe_gth_group_kth_block(
uint64_t *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
template __device__ double2 *get_multi_bit_ith_lwe_gth_group_kth_block(
double2 *ptr, int g, int i, int k, int level, uint32_t grouping_factor,
uint32_t polynomial_size, uint32_t glwe_dimension, uint32_t level_count);
#endif // CNCRT_BSK_H

View File

@@ -1,50 +0,0 @@
#ifndef CIPHERTEXT_H
#define CIPHERTEXT_H
#include "ciphertext.h"
#include "device.h"
#include <cstdint>
template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_gpu(T *dest, T *src, void *v_stream,
uint32_t gpu_index,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cudaSetDevice(gpu_index);
cudaStream_t *stream = static_cast<cudaStream_t *>(v_stream);
uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
}
void cuda_convert_lwe_ciphertext_vector_to_gpu_64(void *dest, void *src,
void *v_stream,
uint32_t gpu_index,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cuda_convert_lwe_ciphertext_vector_to_gpu<uint64_t>(
(uint64_t *)dest, (uint64_t *)src, v_stream, gpu_index, number_of_cts,
lwe_dimension);
}
template <typename T>
void cuda_convert_lwe_ciphertext_vector_to_cpu(T *dest, T *src, void *v_stream,
uint32_t gpu_index,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cudaSetDevice(gpu_index);
cudaStream_t *stream = static_cast<cudaStream_t *>(v_stream);
uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
}
void cuda_convert_lwe_ciphertext_vector_to_cpu_64(void *dest, void *src,
void *v_stream,
uint32_t gpu_index,
uint32_t number_of_cts,
uint32_t lwe_dimension) {
cuda_convert_lwe_ciphertext_vector_to_cpu<uint64_t>(
(uint64_t *)dest, (uint64_t *)src, v_stream, gpu_index, number_of_cts,
lwe_dimension);
}
#endif
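Below is a minimal host-side round trip, sketching how these conversion entry points are meant to be driven; the sizes and parameter values are illustrative assumptions, and the helper declarations are the ones from device.h above.
// Sketch only: assumes the declarations from ciphertext.h / device.h.
#include "ciphertext.h"
#include "device.h"
#include <cstdint>
#include <vector>
void example_lwe_round_trip() {
  uint32_t gpu_index = 0;
  uint32_t number_of_cts = 16;
  uint32_t lwe_dimension = 512; // illustrative value
  uint64_t size = (uint64_t)number_of_cts * (lwe_dimension + 1);
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  std::vector<uint64_t> h_in(size, 0), h_out(size, 0);
  // Each ciphertext occupies (lwe_dimension + 1) words: the mask plus the body.
  uint64_t *d_ct = (uint64_t *)cuda_malloc(size * sizeof(uint64_t), gpu_index);
  cuda_convert_lwe_ciphertext_vector_to_gpu_64(d_ct, h_in.data(), stream,
                                               gpu_index, number_of_cts,
                                               lwe_dimension);
  cuda_convert_lwe_ciphertext_vector_to_cpu_64(h_out.data(), d_ct, stream,
                                               gpu_index, number_of_cts,
                                               lwe_dimension);
  cuda_synchronize_stream(stream);
  cuda_drop(d_ct, gpu_index);
  cuda_destroy_stream(stream, gpu_index);
}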

View File

@@ -1,131 +0,0 @@
#ifndef CNCRT_CRYPTO_H
#define CNCRT_CRYPTO_H
#include "polynomial/polynomial.cuh"
#include <cstdint>
/**
* GadgetMatrix implements the iterator design pattern to decompose a set of
* num_poly consecutive polynomials with degree params::degree. A total of
 * level_count levels is expected, and each call to decompose_and_compress_next()
 * writes the next decomposition level to the result. It is also possible to
 * advance by an arbitrary number of levels using decompose_and_compress_level().
*
* This class always decomposes the entire set of num_poly polynomials.
* By default, it works on a single polynomial.
*/
#pragma once
template <typename T, class params> class GadgetMatrix {
private:
uint32_t level_count;
uint32_t base_log;
uint32_t mask;
uint32_t halfbg;
uint32_t num_poly;
T offset;
int current_level;
T mask_mod_b;
T *state;
public:
__device__ GadgetMatrix(uint32_t base_log, uint32_t level_count, T *state,
uint32_t num_poly = 1)
: base_log(base_log), level_count(level_count), num_poly(num_poly),
state(state) {
mask_mod_b = (1ll << base_log) - 1ll;
current_level = level_count;
int tid = threadIdx.x;
for (int i = 0; i < num_poly * params::opt; i++) {
state[tid] >>= (sizeof(T) * 8 - base_log * level_count);
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
}
// Decomposes all polynomials at once
__device__ void decompose_and_compress_next(double2 *result) {
for (int j = 0; j < num_poly; j++) {
auto result_slice = result + j * params::degree / 2;
decompose_and_compress_next_polynomial(result_slice, j);
}
}
// Decomposes a single polynomial
__device__ void decompose_and_compress_next_polynomial(double2 *result,
int j) {
if (j == 0)
current_level -= 1;
int tid = threadIdx.x;
auto state_slice = state + j * params::degree;
for (int i = 0; i < params::opt / 2; i++) {
T res_re = state_slice[tid] & mask_mod_b;
T res_im = state_slice[tid + params::degree / 2] & mask_mod_b;
state_slice[tid] >>= base_log;
state_slice[tid + params::degree / 2] >>= base_log;
T carry_re = ((res_re - 1ll) | state_slice[tid]) & res_re;
T carry_im =
((res_im - 1ll) | state_slice[tid + params::degree / 2]) & res_im;
carry_re >>= (base_log - 1);
carry_im >>= (base_log - 1);
state_slice[tid] += carry_re;
state_slice[tid + params::degree / 2] += carry_im;
res_re -= carry_re << base_log;
res_im -= carry_im << base_log;
result[tid].x = (int32_t)res_re;
result[tid].y = (int32_t)res_im;
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
}
__device__ void decompose_and_compress_level(double2 *result, int level) {
for (int i = 0; i < level_count - level; i++)
decompose_and_compress_next(result);
}
};
template <typename T> class GadgetMatrixSingle {
private:
uint32_t level_count;
uint32_t base_log;
uint32_t mask;
uint32_t halfbg;
T offset;
public:
__device__ GadgetMatrixSingle(uint32_t base_log, uint32_t level_count)
: base_log(base_log), level_count(level_count) {
uint32_t bg = 1 << base_log;
this->halfbg = bg / 2;
this->mask = bg - 1;
T temp = 0;
for (int i = 0; i < this->level_count; i++) {
temp += 1ULL << (sizeof(T) * 8 - (i + 1) * this->base_log);
}
this->offset = temp * this->halfbg;
}
__device__ T decompose_one_level_single(T element, uint32_t level) {
T s = element + this->offset;
uint32_t decal = (sizeof(T) * 8 - (level + 1) * this->base_log);
T temp1 = (s >> decal) & this->mask;
return (T)(temp1 - this->halfbg);
}
};
template <typename Torus>
__device__ Torus decompose_one(Torus &state, Torus mask_mod_b, int base_log) {
Torus res = state & mask_mod_b;
state >>= base_log;
Torus carry = ((res - 1ll) | state) & res;
carry >>= base_log - 1;
state += carry;
res -= carry << base_log;
return res;
}
#endif // CNCRT_CRYPTO_H
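For intuition, here is a host-side sketch (not part of the GPU build) of the same balanced signed decomposition performed by decompose_one above, together with a reconstruction check; the gadget parameters and the input value are illustrative assumptions.
// Host-side sketch: illustrates the balanced digit decomposition and checks
// that the digits reconstruct the truncated input modulo 2^64.
#include <cassert>
#include <cstdint>
static uint64_t decompose_one_host(uint64_t &state, uint64_t mask_mod_b,
                                   int base_log) {
  uint64_t res = state & mask_mod_b;
  state >>= base_log;
  uint64_t carry = ((res - 1ull) | state) & res;
  carry >>= base_log - 1;
  state += carry;
  res -= carry << base_log;
  return res; // digit in [-beta/2, beta/2), stored as a wrapped uint64_t
}
void example_decomposition() {
  const int base_log = 8, level_count = 3; // illustrative gadget parameters
  const uint64_t mask_mod_b = (1ull << base_log) - 1ull;
  uint64_t a = 0x123456789abcdef0ull;
  // Keep only the top base_log * level_count bits (closest-multiple rounding
  // is omitted here for brevity; see round_to_closest_multiple in torus.cuh).
  uint64_t state = a >> (64 - base_log * level_count);
  uint64_t rounded = state << (64 - base_log * level_count);
  // Digits come out least-significant first: digit j has weight
  // q / beta^(level_count - j).
  uint64_t acc = 0;
  for (int j = 0; j < level_count; j++) {
    uint64_t d = decompose_one_host(state, mask_mod_b, base_log);
    acc += d << (64 - base_log * (level_count - j));
  }
  assert(acc == rounded); // reconstruction holds modulo 2^64
}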

View File

@@ -1,73 +0,0 @@
#ifndef CNCRT_GGSW_CUH
#define CNCRT_GGSW_CUH
#include "device.h"
#include "polynomial/parameters.cuh"
template <typename T, typename ST, class params, sharedMemDegree SMD>
__global__ void device_batch_fft_ggsw_vector(double2 *dest, T *src,
int8_t *device_mem) {
extern __shared__ int8_t sharedmem[];
double2 *selected_memory;
if constexpr (SMD == FULLSM)
selected_memory = (double2 *)sharedmem;
else
    selected_memory = (double2 *)&device_mem[blockIdx.x * params::degree];
// Compression
int offset = blockIdx.x * blockDim.x;
int tid = threadIdx.x;
int log_2_opt = params::opt >> 1;
#pragma unroll
for (int i = 0; i < log_2_opt; i++) {
ST x = src[(tid) + params::opt * offset];
ST y = src[(tid + params::degree / 2) + params::opt * offset];
selected_memory[tid].x = x / (double)std::numeric_limits<T>::max();
selected_memory[tid].y = y / (double)std::numeric_limits<T>::max();
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// Switch to the FFT space
NSMFFT_direct<HalfDegree<params>>(selected_memory);
synchronize_threads_in_block();
// Write the output to global memory
tid = threadIdx.x;
#pragma unroll
for (int j = 0; j < log_2_opt; j++) {
dest[tid + (params::opt >> 1) * offset] = selected_memory[tid];
tid += params::degree / params::opt;
}
}
/**
 * Applies the FFT transform to a sequence of GGSW ciphertexts already in
 * global memory
*/
template <typename T, typename ST, class params>
void batch_fft_ggsw_vector(cudaStream_t *stream, double2 *dest, T *src,
int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
uint32_t polynomial_size, uint32_t level_count,
uint32_t gpu_index, uint32_t max_shared_memory) {
int shared_memory_size = sizeof(double) * polynomial_size;
int gridSize = r * (glwe_dim + 1) * (glwe_dim + 1) * level_count;
int blockSize = polynomial_size / params::opt;
if (max_shared_memory < shared_memory_size) {
device_batch_fft_ggsw_vector<T, ST, params, NOSM>
<<<gridSize, blockSize, 0, *stream>>>(dest, src, d_mem);
} else {
device_batch_fft_ggsw_vector<T, ST, params, FULLSM>
<<<gridSize, blockSize, shared_memory_size, *stream>>>(dest, src,
d_mem);
}
check_cuda_error(cudaGetLastError());
}
#endif // CNCRT_GGSW_CUH

View File

@@ -1,74 +0,0 @@
#ifndef CNCRT_TORUS_H
#define CNCRT_TORUS_H
#include "types/int128.cuh"
#include <limits>
template <typename T>
__device__ inline void typecast_double_to_torus(double x, T &r) {
r = T(x);
}
template <>
__device__ inline void typecast_double_to_torus<uint32_t>(double x,
uint32_t &r) {
r = __double2uint_rn(x);
}
template <>
__device__ inline void typecast_double_to_torus<uint64_t>(double x,
uint64_t &r) {
  // The ull intrinsic does not behave in the same way on all architectures;
  // on some platforms this causes the cmux tree test to fail, hence the
  // intrinsic is not used here.
uint128 nnnn = make_uint128_from_float(x);
uint64_t lll = nnnn.lo_;
r = lll;
}
template <typename T>
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log,
uint32_t level_count) {
T shift = sizeof(T) * 8 - level_count * base_log;
T mask = 1ll << (shift - 1);
T b = (x & mask) >> (shift - 1);
T res = x >> shift;
res += b;
res <<= shift;
return res;
}
template <typename T>
__device__ __forceinline__ void rescale_torus_element(T element, T &output,
uint32_t log_shift) {
output =
round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
(double)log_shift);
}
template <typename T>
__device__ __forceinline__ T rescale_torus_element(T element,
uint32_t log_shift) {
return round((double)element / (double(std::numeric_limits<T>::max()) + 1.0) *
(double)log_shift);
}
template <>
__device__ __forceinline__ void
rescale_torus_element<uint32_t>(uint32_t element, uint32_t &output,
uint32_t log_shift) {
output =
round(__uint2double_rn(element) /
(__uint2double_rn(std::numeric_limits<uint32_t>::max()) + 1.0) *
__uint2double_rn(log_shift));
}
template <>
__device__ __forceinline__ void
rescale_torus_element<uint64_t>(uint64_t element, uint64_t &output,
uint32_t log_shift) {
output = round(__ull2double_rn(element) /
(__ull2double_rn(std::numeric_limits<uint64_t>::max()) + 1.0) *
__uint2double_rn(log_shift));
}
#endif // CNCRT_TORUS_H
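As a quick illustration of round_to_closest_multiple, here is a host-side sketch with illustrative parameter values: the result is the multiple of 2^(bits - level_count * base_log) closest to the input, which is the starting point the decomposition expects.
// Host-side sketch mirroring round_to_closest_multiple<uint64_t> above.
#include <cassert>
#include <cstdint>
static uint64_t round_to_closest_multiple_host(uint64_t x, uint32_t base_log,
                                               uint32_t level_count) {
  uint64_t shift = 64 - (uint64_t)level_count * base_log;
  uint64_t mask = 1ull << (shift - 1);    // bit just below the kept window
  uint64_t b = (x & mask) >> (shift - 1); // 1 if we must round up
  return ((x >> shift) + b) << shift;
}
void example_rounding() {
  // With base_log = 8 and level_count = 3 the step is 2^40.
  uint64_t step = 1ull << 40;
  uint64_t x = 5 * step + (step / 2);     // exactly halfway
  assert(round_to_closest_multiple_host(x, 8, 3) == 6 * step); // ties round up
  uint64_t y = 5 * step + (step / 2) - 1; // just below halfway
  assert(round_to_closest_multiple_host(y, 8, 3) == 5 * step);
}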

View File

@@ -1,243 +0,0 @@
#include "device.h"
#include <cstdint>
#include <cuda_runtime.h>
/// Unsafe function to create a CUDA stream, must first check that the GPU exists
cudaStream_t *cuda_create_stream(uint32_t gpu_index) {
cudaSetDevice(gpu_index);
cudaStream_t *stream = new cudaStream_t;
cudaStreamCreate(stream);
return stream;
}
/// Unsafe function to destroy a CUDA stream, must first check that the GPU exists
int cuda_destroy_stream(cudaStream_t *stream, uint32_t gpu_index) {
cudaSetDevice(gpu_index);
cudaStreamDestroy(*stream);
return 0;
}
/// Unsafe function that will try to allocate even if gpu_index is invalid
/// or if there's not enough memory. A safe wrapper around it must call
/// cuda_check_valid_malloc() first
void *cuda_malloc(uint64_t size, uint32_t gpu_index) {
cudaSetDevice(gpu_index);
void *ptr;
cudaMalloc((void **)&ptr, size);
check_cuda_error(cudaGetLastError());
return ptr;
}
/// Allocates a size-byte array in device memory. Tries to do it
/// asynchronously.
void *cuda_malloc_async(uint64_t size, cudaStream_t *stream,
uint32_t gpu_index) {
cudaSetDevice(gpu_index);
void *ptr;
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
int support_async_alloc;
check_cuda_error(cudaDeviceGetAttribute(
&support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
if (support_async_alloc) {
check_cuda_error(cudaMallocAsync((void **)&ptr, size, *stream));
} else {
check_cuda_error(cudaMalloc((void **)&ptr, size));
}
#else
check_cuda_error(cudaMalloc((void **)&ptr, size));
#endif
return ptr;
}
/// Checks that allocation is valid
/// 0: valid
/// -1: invalid, not enough memory in device
/// -2: invalid, gpu index doesn't exist
int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
size_t total_mem, free_mem;
cudaMemGetInfo(&free_mem, &total_mem);
if (size > free_mem) {
// error code: not enough memory
return -1;
}
return 0;
}
/// Returns
/// -> 0 if Cooperative Groups is not supported.
/// -> 1 otherwise
int cuda_check_support_cooperative_groups() {
int cooperative_groups_supported = 0;
cudaDeviceGetAttribute(&cooperative_groups_supported,
cudaDevAttrCooperativeLaunch, 0);
return cooperative_groups_supported > 0;
}
/// Tries to copy memory to the GPU asynchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
cudaStream_t *stream, uint32_t gpu_index) {
if (size == 0) {
// error code: zero copy size
return -3;
}
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, dest);
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
cudaSetDevice(gpu_index);
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, *stream));
return 0;
}
/// Synchronizes device
/// 0: success
/// -2: error, gpu index doesn't exist
int cuda_synchronize_device(uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
cudaDeviceSynchronize();
return 0;
}
int cuda_memset_async(void *dest, uint64_t val, uint64_t size,
cudaStream_t *stream, uint32_t gpu_index) {
if (size == 0) {
// error code: zero copy size
return -3;
}
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, dest);
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
cudaSetDevice(gpu_index);
cudaMemsetAsync(dest, val, size, *stream);
return 0;
}
/// Tries to copy memory back to the CPU asynchronously
/// 0: success
/// -1: error, invalid device pointer
/// -2: error, gpu index doesn't exist
/// -3: error, zero copy size
int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
cudaStream_t *stream, uint32_t gpu_index) {
if (size == 0) {
// error code: zero copy size
return -3;
}
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaPointerAttributes attr;
cudaPointerGetAttributes(&attr, src);
if (attr.device != gpu_index && attr.type != cudaMemoryTypeDevice) {
// error code: invalid device pointer
return -1;
}
cudaSetDevice(gpu_index);
check_cuda_error(
cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, *stream));
return 0;
}
/// Return number of GPUs available
int cuda_get_number_of_gpus() {
int num_gpus;
cudaGetDeviceCount(&num_gpus);
return num_gpus;
}
/// Drop a cuda array
int cuda_drop(void *ptr, uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
check_cuda_error(cudaFree(ptr));
return 0;
}
/// Drop a cuda array. Tries to do it asynchronously
int cuda_drop_async(void *ptr, cudaStream_t *stream, uint32_t gpu_index) {
cudaSetDevice(gpu_index);
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#elif (CUDART_VERSION >= 11020)
int support_async_alloc;
check_cuda_error(cudaDeviceGetAttribute(
&support_async_alloc, cudaDevAttrMemoryPoolsSupported, gpu_index));
if (support_async_alloc) {
check_cuda_error(cudaFreeAsync(ptr, *stream));
} else {
check_cuda_error(cudaFree(ptr));
}
#else
check_cuda_error(cudaFree(ptr));
#endif
return 0;
}
/// Get the maximum size for the shared memory
int cuda_get_max_shared_memory(uint32_t gpu_index) {
if (gpu_index >= cuda_get_number_of_gpus()) {
// error code: invalid gpu_index
return -2;
}
cudaSetDevice(gpu_index);
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, gpu_index);
int max_shared_memory = 0;
if (prop.major >= 6) {
max_shared_memory = prop.sharedMemPerMultiprocessor;
} else {
max_shared_memory = prop.sharedMemPerBlock;
}
return max_shared_memory;
}
int cuda_synchronize_stream(void *v_stream) {
auto stream = static_cast<cudaStream_t *>(v_stream);
cudaStreamSynchronize(*stream);
return 0;
}
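A hedged usage sketch for these helpers, checking the negative error codes documented above; the buffer size is an illustrative assumption.
// Sketch only: exercises the error-code conventions of the helpers above.
#include "device.h"
#include <cstdint>
#include <cstdio>
int example_device_usage(uint32_t gpu_index) {
  uint64_t size = 1 << 20; // 1 MiB, illustrative
  if (cuda_check_valid_malloc(size, gpu_index) != 0) {
    fprintf(stderr, "invalid gpu index or not enough device memory\n");
    return -1;
  }
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  void *d_buf = cuda_malloc_async(size, stream, gpu_index);
  if (cuda_memset_async(d_buf, 0, size, stream, gpu_index) != 0) {
    fprintf(stderr, "memset failed (bad pointer, gpu index or size)\n");
  }
  cuda_drop_async(d_buf, stream, gpu_index);
  cuda_synchronize_stream(stream);
  cuda_destroy_stream(stream, gpu_index);
  return 0;
}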

View File

@@ -1,739 +0,0 @@
#ifndef GPU_BOOTSTRAP_FFT_CUH
#define GPU_BOOTSTRAP_FFT_CUH
#include "complex/operations.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "twiddles.cuh"
/*
* Direct negacyclic FFT:
* - before the FFT the N real coefficients are stored into a
* N/2 sized complex with the even coefficients in the real part
* and the odd coefficients in the imaginary part. This is referred to
* as the half-size FFT
* - when calling BNSMFFT_direct for the forward negacyclic FFT of PBS,
* opt is divided by 2 because the butterfly pattern is always applied
* between pairs of coefficients
* - instead of twisting each coefficient A_j before the FFT by
* multiplying by the w^j roots of unity (aka twiddles, w=exp(-i pi /N)),
* the FFT is modified, and for each level k of the FFT the twiddle:
* w_j,k = exp(-i pi j/2^k)
* is replaced with:
* \zeta_j,k = exp(-i pi (2j-1)/2^k)
*/
template <class params> __device__ void NSMFFT_direct(double2 *A) {
  /* We don't perform bit reversal here, since the twiddles are already stored
   * in bit-reversed order.
   * Each thread is always in charge of "opt/2" pairs of coefficients,
   * which is why we always loop through N/2 by N/opt strides.
   * The pragma unroll instruction tells the compiler to unroll the
   * full loop, which should increase performance.
   */
size_t tid = threadIdx.x;
size_t twid_id;
size_t i1, i2;
double2 u, v, w;
// level 1
  // we don't perform a full complex multiplication on level 1 since we have
  // only one twiddle; its real and imaginary parts are equal, so we can
  // multiply with simpler operations
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
i1 = tid;
i2 = tid + params::degree / 2;
u = A[i1];
v.x = (A[i2].x - A[i2].y) * 0.707106781186547461715008466854;
v.y = (A[i2].x + A[i2].y) * 0.707106781186547461715008466854;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 2
  // from this level on there is more than one twiddle and none of them has
  // equal real and imaginary parts, so a full complex multiplication is needed.
  // For each level, params::degree / 2^level is the number of coefficients
  // inside each chunk of that level.
//
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4);
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
i2 = i1 + params::degree / 4;
w = negtwiddles[twid_id + 2];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 3
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8);
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
i2 = i1 + params::degree / 8;
w = negtwiddles[twid_id + 4];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 4
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 16);
i1 =
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
i2 = i1 + params::degree / 16;
w = negtwiddles[twid_id + 8];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 5
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 32);
i1 =
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
i2 = i1 + params::degree / 32;
w = negtwiddles[twid_id + 16];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 6
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 64);
i1 =
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
i2 = i1 + params::degree / 64;
w = negtwiddles[twid_id + 32];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 7
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 128);
i1 = 2 * (params::degree / 128) * twid_id +
(tid & (params::degree / 128 - 1));
i2 = i1 + params::degree / 128;
w = negtwiddles[twid_id + 64];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
  // From level 8 on, we need to check the size of params::degree: the minimum
  // supported actual polynomial size is 256, which is halved to a compressed
  // size of 128, so the first 7 butterfly levels are always needed. Since the
  // butterfly levels are hardcoded, from here we must check whether the
  // polynomial size is big enough to require each specific level.
if constexpr (params::degree >= 256) {
// level 8
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 256);
i1 = 2 * (params::degree / 256) * twid_id +
(tid & (params::degree / 256 - 1));
i2 = i1 + params::degree / 256;
w = negtwiddles[twid_id + 128];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 512) {
// level 9
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 512);
i1 = 2 * (params::degree / 512) * twid_id +
(tid & (params::degree / 512 - 1));
i2 = i1 + params::degree / 512;
w = negtwiddles[twid_id + 256];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 1024) {
// level 10
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 1024);
i1 = 2 * (params::degree / 1024) * twid_id +
(tid & (params::degree / 1024 - 1));
i2 = i1 + params::degree / 1024;
w = negtwiddles[twid_id + 512];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 2048) {
// level 11
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2048);
i1 = 2 * (params::degree / 2048) * twid_id +
(tid & (params::degree / 2048 - 1));
i2 = i1 + params::degree / 2048;
w = negtwiddles[twid_id + 1024];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 4096) {
// level 12
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4096);
i1 = 2 * (params::degree / 4096) * twid_id +
(tid & (params::degree / 4096 - 1));
i2 = i1 + params::degree / 4096;
w = negtwiddles[twid_id + 2048];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
  // A compressed size of 8192 corresponds to an actual polynomial size of
  // 16384. From this size on, the twiddles can't fit in constant memory,
  // so the butterfly operation reads them from device memory.
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8192);
i1 = 2 * (params::degree / 8192) * twid_id +
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles13[twid_id];
u = A[i1];
v.x = A[i2].x * w.x - A[i2].y * w.y;
v.y = A[i2].y * w.x + A[i2].x * w.y;
A[i1].x += v.x;
A[i1].y += v.y;
A[i2].x = u.x - v.x;
A[i2].y = u.y - v.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
}
/*
* negacyclic inverse fft
*/
template <class params> __device__ void NSMFFT_inverse(double2 *A) {
  /* We don't perform bit reversal here, since the twiddles are already stored
   * in bit-reversed order.
   * Each thread is always in charge of "opt/2" pairs of coefficients,
   * which is why we always loop through N/2 by N/opt strides.
   * The pragma unroll instruction tells the compiler to unroll the
   * full loop, which should increase performance.
   */
size_t tid = threadIdx.x;
size_t twid_id;
size_t i1, i2;
double2 u, w;
// divide input by compressed polynomial size
tid = threadIdx.x;
for (size_t i = 0; i < params::opt; ++i) {
A[tid].x *= 1. / params::degree;
A[tid].y *= 1. / params::degree;
tid += params::degree / params::opt;
}
__syncthreads();
  // None of the twiddles have equal real and imaginary parts, so a full
  // complex multiplication has to be done, and there is more than one twiddle
  // per level. The mapping in the backward fft is reversed: the butterfly
  // operation starts from the last level.
  // A compressed size of 8192 corresponds to an actual polynomial size of
  // 16384; the twiddles for this size can't fit in constant memory, so the
  // butterfly operation for this level fetches them from device memory.
if constexpr (params::degree >= 8192) {
// level 13
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8192);
i1 = 2 * (params::degree / 8192) * twid_id +
(tid & (params::degree / 8192 - 1));
i2 = i1 + params::degree / 8192;
w = negtwiddles13[twid_id];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 4096) {
// level 12
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4096);
i1 = 2 * (params::degree / 4096) * twid_id +
(tid & (params::degree / 4096 - 1));
i2 = i1 + params::degree / 4096;
w = negtwiddles[twid_id + 2048];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 2048) {
// level 11
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2048);
i1 = 2 * (params::degree / 2048) * twid_id +
(tid & (params::degree / 2048 - 1));
i2 = i1 + params::degree / 2048;
w = negtwiddles[twid_id + 1024];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 1024) {
// level 10
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 1024);
i1 = 2 * (params::degree / 1024) * twid_id +
(tid & (params::degree / 1024 - 1));
i2 = i1 + params::degree / 1024;
w = negtwiddles[twid_id + 512];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 512) {
// level 9
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 512);
i1 = 2 * (params::degree / 512) * twid_id +
(tid & (params::degree / 512 - 1));
i2 = i1 + params::degree / 512;
w = negtwiddles[twid_id + 256];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
if constexpr (params::degree >= 256) {
// level 8
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 256);
i1 = 2 * (params::degree / 256) * twid_id +
(tid & (params::degree / 256 - 1));
i2 = i1 + params::degree / 256;
w = negtwiddles[twid_id + 128];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
  // Below level 8, we don't need to check the size of params::degree: the
  // minimum supported actual polynomial size is 256, which is halved to a
  // compressed size of 128, so the last 7 butterfly levels are always needed.
  // Since the butterfly levels are hardcoded, we don't need to check whether
  // the polynomial size is big enough to require these levels.
// level 7
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 128);
i1 = 2 * (params::degree / 128) * twid_id +
(tid & (params::degree / 128 - 1));
i2 = i1 + params::degree / 128;
w = negtwiddles[twid_id + 64];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 6
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 64);
i1 =
2 * (params::degree / 64) * twid_id + (tid & (params::degree / 64 - 1));
i2 = i1 + params::degree / 64;
w = negtwiddles[twid_id + 32];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 5
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 32);
i1 =
2 * (params::degree / 32) * twid_id + (tid & (params::degree / 32 - 1));
i2 = i1 + params::degree / 32;
w = negtwiddles[twid_id + 16];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 4
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 16);
i1 =
2 * (params::degree / 16) * twid_id + (tid & (params::degree / 16 - 1));
i2 = i1 + params::degree / 16;
w = negtwiddles[twid_id + 8];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 3
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 8);
i1 = 2 * (params::degree / 8) * twid_id + (tid & (params::degree / 8 - 1));
i2 = i1 + params::degree / 8;
w = negtwiddles[twid_id + 4];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 2
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 4);
i1 = 2 * (params::degree / 4) * twid_id + (tid & (params::degree / 4 - 1));
i2 = i1 + params::degree / 4;
w = negtwiddles[twid_id + 2];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
// level 1
tid = threadIdx.x;
#pragma unroll
for (size_t i = 0; i < params::opt / 2; ++i) {
twid_id = tid / (params::degree / 2);
i1 = 2 * (params::degree / 2) * twid_id + (tid & (params::degree / 2 - 1));
i2 = i1 + params::degree / 2;
w = negtwiddles[twid_id + 1];
u.x = A[i1].x - A[i2].x;
u.y = A[i1].y - A[i2].y;
A[i1].x += A[i2].x;
A[i1].y += A[i2].y;
A[i2].x = u.x * w.x + u.y * w.y;
A[i2].y = u.y * w.x - u.x * w.y;
tid += params::degree / params::opt;
}
__syncthreads();
}
/*
 * global batch fft
 * does the fft in half size; unrolling the half-size fft results in
 * half size + 1 elements. This function must be called with the actual
 * degree, and takes already compressed data as input.
*/
template <class params, sharedMemDegree SMD>
__global__ void batch_NSMFFT(double2 *d_input, double2 *d_output,
double2 *buffer) {
extern __shared__ double2 sharedMemoryFFT[];
double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
: sharedMemoryFFT;
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = d_input[blockIdx.x * (params::degree / 2) + tid];
tid = tid + params::degree / params::opt;
}
__syncthreads();
NSMFFT_direct<HalfDegree<params>>(fft);
__syncthreads();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
tid = tid + params::degree / params::opt;
}
}
/*
* global batch polynomial multiplication
* only used for fft tests
* d_input1 and d_output must not have the same pointer
* d_input1 can be modified inside the function
*/
template <class params, sharedMemDegree SMD>
__global__ void batch_polynomial_mul(double2 *d_input1, double2 *d_input2,
double2 *d_output, double2 *buffer) {
extern __shared__ double2 sharedMemoryFFT[];
double2 *fft = (SMD == NOSM) ? &buffer[blockIdx.x * params::degree / 2]
: sharedMemoryFFT;
  // Move the first polynomial into shared memory (if possible; otherwise it
  // is placed in the device buffer)
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = d_input1[blockIdx.x * (params::degree / 2) + tid];
tid = tid + params::degree / params::opt;
}
// Perform direct negacyclic fourier transform
__syncthreads();
NSMFFT_direct<HalfDegree<params>>(fft);
__syncthreads();
// Put the result of direct fft inside input1
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
d_input1[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
tid = tid + params::degree / params::opt;
}
__syncthreads();
  // Move the second polynomial into shared memory (if possible; otherwise it
  // is placed in the device buffer)
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] = d_input2[blockIdx.x * (params::degree / 2) + tid];
tid = tid + params::degree / params::opt;
}
// Perform direct negacyclic fourier transform on the second polynomial
__syncthreads();
NSMFFT_direct<HalfDegree<params>>(fft);
__syncthreads();
// calculate pointwise multiplication inside fft buffer
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
fft[tid] *= d_input1[blockIdx.x * (params::degree / 2) + tid];
tid = tid + params::degree / params::opt;
}
// Perform backward negacyclic fourier transform
__syncthreads();
NSMFFT_inverse<HalfDegree<params>>(fft);
__syncthreads();
// copy results in output buffer
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
d_output[blockIdx.x * (params::degree / 2) + tid] = fft[tid];
tid = tid + params::degree / params::opt;
}
}
#endif // GPU_BOOTSTRAP_FFT_CUH
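For reference, a plain host-side negacyclic schoolbook product, the kind of oracle batch_polynomial_mul can be compared against in the FFT tests; the coefficient type and the equal-size assumption are illustrative choices.
// Host-side oracle sketch: c = a * b mod (X^N + 1), with wrap-around terms
// negated. This is the product the half-size negacyclic FFT above computes.
#include <cstdint>
#include <vector>
std::vector<int64_t> negacyclic_mul_reference(const std::vector<int64_t> &a,
                                              const std::vector<int64_t> &b) {
  size_t n = a.size(); // assumes a.size() == b.size() == N, a power of two
  std::vector<int64_t> c(n, 0);
  for (size_t i = 0; i < n; i++) {
    for (size_t j = 0; j < n; j++) {
      size_t k = i + j;
      if (k < n)
        c[k] += a[i] * b[j];     // regular term
      else
        c[k - n] -= a[i] * b[j]; // X^N = -1, so wrapped terms flip sign
    }
  }
  return c;
}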

File diff suppressed because it is too large

View File

@@ -1,14 +0,0 @@
#ifndef GPU_BOOTSTRAP_TWIDDLES_CUH
#define GPU_BOOTSTRAP_TWIDDLES_CUH
/*
 * 'negtwiddles' are stored in constant memory for faster access times.
 * Because of its limited size, only the twiddles for polynomial sizes up to
 * 2^12 can be stored there; the twiddles for 2^13 are stored in device
 * memory as 'negtwiddles13'.
*/
extern __constant__ double2 negtwiddles[4096];
extern __device__ double2 negtwiddles13[4096];
#endif

View File

@@ -1,98 +0,0 @@
#include "keyswitch.cuh"
#include "keyswitch.h"
#include "polynomial/parameters.cuh"
#include <cstdint>
/* Perform keyswitch on a batch of 32 bits input LWE ciphertexts.
 * See the equivalent operation on 64 bits for more details.
*/
void cuda_keyswitch_lwe_ciphertext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
v_stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in), static_cast<uint32_t *>(ksk),
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}
/* Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
*
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - lwe_array_out: output batch of num_samples keyswitched ciphertexts c =
* (a0,..an-1,b) where n is the output LWE dimension (lwe_dimension_out)
* - lwe_array_in: input batch of num_samples LWE ciphertexts, containing
* lwe_dimension_in mask values + 1 body value
* - ksk: the keyswitch key to be used in the operation
* - base log: the log of the base used in the decomposition (should be the one
* used to create the ksk)
*
* This function calls a wrapper to a device kernel that performs the keyswitch
* - num_samples blocks of threads are launched
*/
void cuda_keyswitch_lwe_ciphertext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
cuda_keyswitch_lwe_ciphertext_vector(
v_stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in), static_cast<uint64_t *>(ksk),
lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
}
/* Perform functional packing keyswitch on a batch of 32 bits input LWE
* ciphertexts. See the equivalent function on 64 bit inputs for more details.
*/
void cuda_fp_keyswitch_lwe_to_glwe_32(
void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
uint32_t number_of_keys) {
cuda_fp_keyswitch_lwe_to_glwe(
v_stream, gpu_index, static_cast<uint32_t *>(glwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(fp_ksk_array), input_lwe_dimension,
output_glwe_dimension, output_polynomial_size, base_log, level_count,
number_of_input_lwe, number_of_keys);
}
/* Perform functional packing keyswitch on a batch of 64 bits input LWE
* ciphertexts.
*
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `glwe_array_out`: output batch of keyswitched ciphertexts
* - `lwe_array_in`: input batch of num_samples LWE ciphertexts, containing
* lwe_dimension_in mask values + 1 body value
* - `fp_ksk_array`: the functional packing keyswitch keys to be used in the
* operation
* - `base log`: the log of the base used in the decomposition (should be the
* one used to create the ksk)
* - `level_count`: the number of levels used in the decomposition (should be
* the one used to create the fp_ksks).
* - `number_of_input_lwe`: the number of inputs
* - `number_of_keys`: the number of fp_ksks
*
* This function calls a wrapper to a device kernel that performs the functional
* packing keyswitch.
*/
void cuda_fp_keyswitch_lwe_to_glwe_64(
void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *lwe_array_in, void *fp_ksk_array, uint32_t input_lwe_dimension,
uint32_t output_glwe_dimension, uint32_t output_polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t number_of_input_lwe,
uint32_t number_of_keys) {
cuda_fp_keyswitch_lwe_to_glwe(
v_stream, gpu_index, static_cast<uint64_t *>(glwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(fp_ksk_array), input_lwe_dimension,
output_glwe_dimension, output_polynomial_size, base_log, level_count,
number_of_input_lwe, number_of_keys);
}
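A minimal host-side driver sketch for the 64-bit keyswitch entry point above; the dimensions, decomposition parameters and the already-initialized device pointers are assumptions made for illustration.
// Sketch only: wiring for cuda_keyswitch_lwe_ciphertext_vector_64. The ksk,
// input and output buffers are assumed to be device pointers filled elsewhere.
#include "device.h"
#include "keyswitch.h"
#include <cstdint>
void example_keyswitch(uint32_t gpu_index, void *d_lwe_in, void *d_lwe_out,
                       void *d_ksk) {
  // Illustrative parameters: 64-bit torus, base 2^2, 5 decomposition levels.
  uint32_t lwe_dimension_in = 2048, lwe_dimension_out = 742;
  uint32_t base_log = 2, level_count = 5, num_samples = 8;
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  // d_lwe_in holds num_samples * (lwe_dimension_in + 1) words,
  // d_lwe_out must hold num_samples * (lwe_dimension_out + 1) words.
  cuda_keyswitch_lwe_ciphertext_vector_64(
      stream, gpu_index, d_lwe_out, d_lwe_in, d_ksk, lwe_dimension_in,
      lwe_dimension_out, base_log, level_count, num_samples);
  cuda_synchronize_stream(stream);
  cuda_destroy_stream(stream, gpu_index);
}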

View File

@@ -1,228 +0,0 @@
#ifndef CNCRT_KS_H
#define CNCRT_KS_H
#include "crypto/gadget.cuh"
#include "crypto/torus.cuh"
#include "polynomial/polynomial.cuh"
#include <thread>
#include <vector>
template <typename Torus>
__device__ Torus *get_ith_block(Torus *ksk, int i, int level,
uint32_t lwe_dimension_out,
uint32_t level_count) {
int pos = i * level_count * (lwe_dimension_out + 1) +
level * (lwe_dimension_out + 1);
Torus *ptr = &ksk[pos];
return ptr;
}
// blockIdx.y indexes a single lwe ciphertext
// blockIdx.x indexes a chunk of that lwe ciphertext,
// with chunk_count = glwe_size * polynomial_size / threads.
// Each thread is responsible for only lwe_size multiplications.
template <typename Torus>
__global__ void
fp_keyswitch(Torus *glwe_array_out, Torus *lwe_array_in, Torus *fp_ksk_array,
uint32_t lwe_dimension_in, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
uint32_t number_of_input_lwe, uint32_t number_of_keys) {
size_t tid = threadIdx.x;
size_t glwe_size = (glwe_dimension + 1);
size_t lwe_size = (lwe_dimension_in + 1);
// number of coefficients in a single fp-ksk
size_t ksk_size = lwe_size * level_count * glwe_size * polynomial_size;
// number of coefficients inside fp-ksk block for each lwe_input coefficient
size_t ksk_block_size = glwe_size * polynomial_size * level_count;
size_t ciphertext_id = blockIdx.y;
// number of coefficients processed inside single block
size_t coef_per_block = blockDim.x;
size_t chunk_id = blockIdx.x;
size_t ksk_id = ciphertext_id % number_of_keys;
extern __shared__ int8_t sharedmem[];
// result accumulator, shared memory is used because of frequent access
Torus *local_glwe_chunk = (Torus *)sharedmem;
// current input lwe ciphertext
auto cur_input_lwe = &lwe_array_in[ciphertext_id * lwe_size];
// current output glwe ciphertext
auto cur_output_glwe =
&glwe_array_out[ciphertext_id * glwe_size * polynomial_size];
// current out glwe chunk, will be processed inside single block
auto cur_glwe_chunk = &cur_output_glwe[chunk_id * coef_per_block];
// fp key used for current ciphertext
auto cur_ksk = &fp_ksk_array[ksk_id * ksk_size];
// set shared mem accumulator to 0
local_glwe_chunk[tid] = 0;
// iterate through each coefficient of input lwe
for (size_t i = 0; i <= lwe_dimension_in; i++) {
Torus a_i =
round_to_closest_multiple(cur_input_lwe[i], base_log, level_count);
Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
Torus mod_b_mask = (1ll << base_log) - 1ll;
// block of key for current lwe coefficient (cur_input_lwe[i])
auto ksk_block = &cur_ksk[i * ksk_block_size];
// iterate through levels, calculating decomposition in reverse order
for (size_t j = 0; j < level_count; j++) {
auto ksk_glwe =
&ksk_block[(level_count - j - 1) * glwe_size * polynomial_size];
auto ksk_glwe_chunk = &ksk_glwe[chunk_id * coef_per_block];
Torus decomposed = decompose_one<Torus>(state, mod_b_mask, base_log);
local_glwe_chunk[tid] -= decomposed * ksk_glwe_chunk[tid];
}
}
cur_glwe_chunk[tid] = local_glwe_chunk[tid];
}
/*
* keyswitch kernel
* Each thread handles a piece of the following equation:
 * $$GLWE_{s2}(\Delta m + e) = (0,0,..,0,b) - \sum_{i=0}^{k-1} \langle Dec(a_i),
 * (GLWE_{s2}(s1_i q/\beta),..,GLWE_{s2}(s1_i q/\beta^l)) \rangle$$
 * where k is the dimension of the GLWE ciphertext. If the polynomial dimension
 * in GLWE is > 1, this equation is solved for each polynomial coefficient.
 * Dec denotes the decomposition with base \beta and l levels, and the inner
 * product is taken between the decomposition of a_i and the l GLWE encryptions
 * of s1_i q/\beta^j, with j in [1,l]. We obtain a GLWE encryption of \Delta m
 * (with \Delta the scaling factor) under key s2 instead of s1, with an
 * increased noise.
*
*/
template <typename Torus>
__global__ void keyswitch(Torus *lwe_array_out, Torus *lwe_array_in, Torus *ksk,
uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
uint32_t base_log, uint32_t level_count,
int lwe_lower, int lwe_upper, int cutoff) {
int tid = threadIdx.x;
extern __shared__ int8_t sharedmem[];
Torus *local_lwe_array_out = (Torus *)sharedmem;
auto block_lwe_array_in =
get_chunk(lwe_array_in, blockIdx.x, lwe_dimension_in + 1);
auto block_lwe_array_out =
get_chunk(lwe_array_out, blockIdx.x, lwe_dimension_out + 1);
auto gadget = GadgetMatrixSingle<Torus>(base_log, level_count);
int lwe_part_per_thd;
if (tid < cutoff) {
lwe_part_per_thd = lwe_upper;
} else {
lwe_part_per_thd = lwe_lower;
}
__syncthreads();
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
local_lwe_array_out[idx] = 0;
}
if (tid == 0) {
local_lwe_array_out[lwe_dimension_out] =
block_lwe_array_in[lwe_dimension_in];
}
for (int i = 0; i < lwe_dimension_in; i++) {
__syncthreads();
Torus a_i =
round_to_closest_multiple(block_lwe_array_in[i], base_log, level_count);
Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count);
Torus mask_mod_b = (1ll << base_log) - 1ll;
for (int j = 0; j < level_count; j++) {
auto ksk_block = get_ith_block(ksk, i, j, lwe_dimension_out, level_count);
Torus decomposed = decompose_one<Torus>(state, mask_mod_b, base_log);
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
local_lwe_array_out[idx] -= (Torus)ksk_block[idx] * decomposed;
}
}
}
for (int k = 0; k < lwe_part_per_thd; k++) {
int idx = tid + k * blockDim.x;
block_lwe_array_out[idx] = local_lwe_array_out[idx];
}
}
/// Assumes lwe_array_in is already on the GPU
template <typename Torus>
__host__ void cuda_keyswitch_lwe_ciphertext_vector(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_array_in, Torus *ksk, uint32_t lwe_dimension_in,
uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
uint32_t num_samples) {
cudaSetDevice(gpu_index);
constexpr int ideal_threads = 128;
int lwe_dim = lwe_dimension_out + 1;
int lwe_lower, lwe_upper, cutoff;
if (lwe_dim % ideal_threads == 0) {
lwe_lower = lwe_dim / ideal_threads;
lwe_upper = lwe_dim / ideal_threads;
cutoff = 0;
} else {
int y =
ceil((double)lwe_dim / (double)ideal_threads) * ideal_threads - lwe_dim;
cutoff = ideal_threads - y;
lwe_lower = lwe_dim / ideal_threads;
lwe_upper = (int)ceil((double)lwe_dim / (double)ideal_threads);
}
int lwe_size_after = (lwe_dimension_out + 1) * num_samples;
int shared_mem = sizeof(Torus) * (lwe_dimension_out + 1);
auto stream = static_cast<cudaStream_t *>(v_stream);
cudaMemsetAsync(lwe_array_out, 0, sizeof(Torus) * lwe_size_after, *stream);
dim3 grid(num_samples, 1, 1);
dim3 threads(ideal_threads, 1, 1);
cudaFuncSetAttribute(keyswitch<Torus>,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared_mem);
keyswitch<<<grid, threads, shared_mem, *stream>>>(
lwe_array_out, lwe_array_in, ksk, lwe_dimension_in, lwe_dimension_out,
base_log, level_count, lwe_lower, lwe_upper, cutoff);
check_cuda_error(cudaGetLastError());
}
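// Worked example of the thread partitioning above (illustrative numbers):
// with lwe_dimension_out + 1 = 513 and ideal_threads = 128, we get
// y = ceil(513 / 128) * 128 - 513 = 127, so cutoff = 1, lwe_lower = 4 and
// lwe_upper = 5: one thread handles 5 output coefficients and the remaining
// 127 threads handle 4 each, covering 5 + 127 * 4 = 513 coefficients.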
template <typename Torus>
__host__ void cuda_fp_keyswitch_lwe_to_glwe(
void *v_stream, uint32_t gpu_index, Torus *glwe_array_out,
Torus *lwe_array_in, Torus *fp_ksk_array, uint32_t lwe_dimension_in,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t number_of_input_lwe,
uint32_t number_of_keys) {
cudaSetDevice(gpu_index);
int threads = 256;
int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
dim3 blocks(glwe_accumulator_size / threads, number_of_input_lwe, 1);
int shared_mem = sizeof(Torus) * threads;
auto stream = static_cast<cudaStream_t *>(v_stream);
fp_keyswitch<<<blocks, threads, shared_mem, *stream>>>(
glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in,
glwe_dimension, polynomial_size, base_log, level_count,
number_of_input_lwe, number_of_keys);
}
#endif

View File

@@ -1,183 +0,0 @@
#include "multiplication.cuh"
/*
* Perform the multiplication of a u32 input LWE ciphertext vector with a u32
* cleartext vector. See the equivalent operation on u64 data for more details.
*/
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_cleartext_multiplication(
v_stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in),
static_cast<uint32_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the multiplication of a u64 input LWE ciphertext vector with a u64
* input cleartext vector.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
* been allocated and initialized before calling this function. It has the same
* size as the output array.
* - `cleartext_array_in` is the cleartext vector used as input, it should have
* been allocated and initialized before calling this function. It should be of
* size `input_lwe_ciphertext_count`.
* - `input_lwe_dimension` is the number of mask elements in the input and
* output LWE ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in the
* input LWE ciphertext vector, as well as in the output. It is also the number
* of cleartexts in the input cleartext vector.
*
* Each cleartext of the input cleartext vector is multiplied to the mask and
* body of the corresponding LWE ciphertext in the LWE ciphertext vector. The
* result of the operation is stored in the output LWE ciphertext vector. The
* two input vectors are unchanged. This function is a wrapper to a device
* function that performs the operation on the GPU.
*/
void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *cleartext_array_in, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_cleartext_multiplication(
v_stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in),
static_cast<uint64_t *>(cleartext_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
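// A short host-side sketch of driving the 64-bit entry point documented
// above. The device buffers are assumed to be allocated and filled elsewhere,
// the parameter values are illustrative, and the declaring header
// (linear_algebra.h) is an assumption.
#include "device.h"
#include "linear_algebra.h"
#include <cstdint>
void example_cleartext_mult(uint32_t gpu_index, void *d_lwe_in,
                            void *d_cleartexts, void *d_lwe_out) {
  uint32_t lwe_dimension = 742;  // illustrative
  uint32_t ciphertext_count = 8; // illustrative
  cudaStream_t *stream = cuda_create_stream(gpu_index);
  // d_lwe_in / d_lwe_out hold ciphertext_count * (lwe_dimension + 1) words,
  // d_cleartexts holds ciphertext_count words.
  cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
      stream, gpu_index, d_lwe_out, d_lwe_in, d_cleartexts, lwe_dimension,
      ciphertext_count);
  cuda_synchronize_stream(stream);
  cuda_destroy_stream(stream, gpu_index);
}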
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the integer radix multiplication in keyswitch->bootstrap order.
*/
void scratch_cuda_integer_mult_radix_ciphertext_kb_64(
void *v_stream, uint32_t gpu_index, void *mem_ptr, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
uint32_t ks_base_log, uint32_t ks_level, uint32_t num_blocks,
PBS_TYPE pbs_type, uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 2048:
scratch_cuda_integer_mult_radix_ciphertext_kb<uint64_t, Degree<2048>>(
v_stream, gpu_index, (int_mul_memory<uint64_t> *)mem_ptr,
message_modulus, carry_modulus, glwe_dimension, lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, ks_base_log, ks_level,
num_blocks, pbs_type, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
 * Computes a multiplication between two 64 bit radix lwe ciphertexts
 * encrypting integer values. The keyswitch -> bootstrap pattern is used. The
 * function works on a single pair of radix ciphertexts; 'v_stream' can be
 * used for parallelization.
* - 'v_stream' is a void pointer to the Cuda stream to be used in the kernel
* launch
* - 'gpu_index' is the index of the GPU to be used in the kernel launch
* - 'radix_lwe_out' is 64 bit radix big lwe ciphertext, product of
* multiplication
* - 'radix_lwe_left' left radix big lwe ciphertext
* - 'radix_lwe_right' right radix big lwe ciphertext
* - 'ct_degree_out' degree for each lwe ciphertext block for out
* RadixCiphertext
* - 'ct_degree_left' degree for each lwe ciphertext block for left
* RadixCiphertext
* - 'ct_degree_right' degree for each lwe ciphertext block for right
* RadixCiphertext
* - 'bsk' bootstrapping key in fourier domain
* - 'ksk' keyswitching key
* - 'mem_ptr'
* - 'message_modulus' message_modulus
* - 'carry_modulus' carry_modulus
* - 'glwe_dimension' glwe_dimension
* - 'lwe_dimension' is the dimension of small lwe ciphertext
* - 'polynomial_size' polynomial size
* - 'pbs_base_log' base log used in the pbs
* - 'pbs_level' decomposition level count used in the pbs
* - 'ks_level' decomposition level count used in the keyswitch
* - 'num_blocks' is the number of big lwe ciphertext blocks inside radix
* ciphertext
* - 'pbs_type' selects which PBS implementation should be used
* - 'max_shared_memory' maximum shared memory per cuda block
*/
void cuda_integer_mult_radix_ciphertext_kb_64(
void *v_stream, uint32_t gpu_index, void *radix_lwe_out,
void *radix_lwe_left, void *radix_lwe_right, uint32_t *ct_degree_out,
uint32_t *ct_degree_left, uint32_t *ct_degree_right, void *bsk, void *ksk,
void *mem_ptr, uint32_t message_modulus, uint32_t carry_modulus,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log,
uint32_t ks_level, uint32_t num_blocks, PBS_TYPE pbs_type,
uint32_t max_shared_memory) {
switch (polynomial_size) {
case 2048:
host_integer_mult_radix_kb<uint64_t, int64_t, AmortizedDegree<2048>>(
v_stream, gpu_index, (uint64_t *)radix_lwe_out,
(uint64_t *)radix_lwe_left, (uint64_t *)radix_lwe_right, ct_degree_out,
ct_degree_left, ct_degree_right, bsk, (uint64_t *)ksk,
(int_mul_memory<uint64_t> *)mem_ptr, message_modulus, carry_modulus,
glwe_dimension, lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
ks_base_log, ks_level, num_blocks, pbs_type, max_shared_memory);
break;
default:
break;
}
}
void scratch_cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
void *mem_ptr, void *bsk, void *ksk, uint32_t message_modulus,
uint32_t carry_modulus, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level,
uint32_t ks_base_log, uint32_t ks_level, uint32_t num_blocks,
PBS_TYPE pbs_type, uint32_t max_shared_memory, bool allocate_gpu_memory) {
switch (polynomial_size) {
case 2048:
scratch_cuda_integer_mult_radix_ciphertext_kb_multi_gpu<uint64_t,
Degree<2048>>(
(int_mul_memory<uint64_t> *)mem_ptr, (uint64_t *)bsk, (uint64_t *)ksk,
message_modulus, carry_modulus, glwe_dimension, lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, ks_base_log, ks_level,
num_blocks, pbs_type, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
void cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
void *radix_lwe_out, void *radix_lwe_left, void *radix_lwe_right,
uint32_t *ct_degree_out, uint32_t *ct_degree_left,
uint32_t *ct_degree_right, void *bsk, void *ksk, void *mem_ptr,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log,
uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level,
uint32_t num_blocks, PBS_TYPE pbs_type, uint32_t max_shared_memory) {
switch (polynomial_size) {
case 2048:
host_integer_mult_radix_kb_multi_gpu<uint64_t, int64_t, Degree<2048>>(
(uint64_t *)radix_lwe_out, (uint64_t *)radix_lwe_left,
(uint64_t *)radix_lwe_right, ct_degree_out, ct_degree_left,
ct_degree_right, (uint64_t *)bsk, (uint64_t *)ksk,
(int_mul_memory<uint64_t> *)mem_ptr, message_modulus, carry_modulus,
glwe_dimension, lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
ks_base_log, ks_level, num_blocks, max_shared_memory);
break;
default:
break;
}
}

File diff suppressed because it is too large

View File

@@ -1,49 +0,0 @@
#include "negation.cuh"
/*
* Perform the negation of a u32 input LWE ciphertext vector.
* See the equivalent operation on u64 ciphertexts for more details.
*/
void cuda_negate_lwe_ciphertext_vector_32(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_negation(v_stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
static_cast<uint32_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
/*
* Perform the negation of a u64 input LWE ciphertext vector.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - `lwe_array_out` is an array of size
* `(input_lwe_dimension + 1) * input_lwe_ciphertext_count` that should have
* been allocated on the GPU before calling this function, and that will hold
* the result of the computation.
* - `lwe_array_in` is the LWE ciphertext vector used as input, it should have
* been allocated and initialized before calling this function. It has the same
* size as the output array.
* - `input_lwe_dimension` is the number of mask elements in the two input and
* in the output ciphertext vectors
* - `input_lwe_ciphertext_count` is the number of ciphertexts contained in each
* input LWE ciphertext vector, as well as in the output.
*
* Each element (mask element or body) of the input LWE ciphertext vector is
* negated. The result is stored in the output LWE ciphertext vector. The input
* LWE ciphertext vector is left unchanged. This function is a wrapper to a
* device function that performs the operation on the GPU.
*/
void cuda_negate_lwe_ciphertext_vector_64(void *v_stream, uint32_t gpu_index,
void *lwe_array_out,
void *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
host_negation(v_stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
static_cast<uint64_t *>(lwe_array_in), input_lwe_dimension,
input_lwe_ciphertext_count);
}
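// Illustrative usage sketch (not part of the original file): negate a vector
// of `count` u64 LWE ciphertexts of dimension `lwe_dim`. The host buffers
// (h_in/h_out) and the parameter values are placeholders supplied by the
// caller; error checking is omitted for brevity.
void example_negate_u64(uint64_t const *h_in, uint64_t *h_out,
                        uint32_t lwe_dim, uint32_t count) {
  size_t bytes = sizeof(uint64_t) * (lwe_dim + 1) * count;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  void *d_in, *d_out;
  cudaMalloc(&d_in, bytes);
  cudaMalloc(&d_out, bytes);
  cudaMemcpyAsync(d_in, h_in, bytes, cudaMemcpyHostToDevice, stream);
  cuda_negate_lwe_ciphertext_vector_64(&stream, /*gpu_index=*/0, d_out, d_in,
                                       lwe_dim, count);
  cudaMemcpyAsync(h_out, d_out, bytes, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
  cudaFree(d_in);
  cudaFree(d_out);
  cudaStreamDestroy(stream);
}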

View File

@@ -1,45 +0,0 @@
#ifndef CUDA_NEGATE_H
#define CUDA_NEGATE_H
#ifdef __CDT_PARSER__
#undef __CUDA_RUNTIME_H__
#include <cuda_runtime.h>
#endif
#include "device.h"
#include "linear_algebra.h"
#include "utils/kernel_dimensions.cuh"
template <typename T>
__global__ void negation(T *output, T *input, uint32_t num_entries) {
int tid = threadIdx.x;
int index = blockIdx.x * blockDim.x + tid;
if (index < num_entries) {
// Here we take advantage of the wrapping behaviour of uint
output[index] = -input[index];
}
}
template <typename T>
__host__ void host_negation(void *v_stream, uint32_t gpu_index, T *output,
T *input, uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count) {
cudaSetDevice(gpu_index);
// lwe_size includes the presence of the body
// whereas lwe_dimension is the number of elements in the mask
int lwe_size = input_lwe_dimension + 1;
// Create a 1-dimensional grid of threads
int num_blocks = 0, num_threads = 0;
int num_entries = input_lwe_ciphertext_count * lwe_size;
getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
dim3 grid(num_blocks, 1, 1);
dim3 thds(num_threads, 1, 1);
auto stream = static_cast<cudaStream_t *>(v_stream);
negation<<<grid, thds, 0, *stream>>>(output, input, num_entries);
check_cuda_error(cudaGetLastError());
}
#endif // CUDA_NEGATE_H

View File

@@ -1,304 +0,0 @@
#ifndef GPU_POLYNOMIAL_FUNCTIONS
#define GPU_POLYNOMIAL_FUNCTIONS
#include "device.h"
#include "utils/timer.cuh"
// Return A if C == 0 and B if C == 1
#define SEL(A, B, C) ((-(C) & ((A) ^ (B))) ^ (A))
/*
 * Compresses the decomposed buffer into a half-size complex buffer for the fft
*/
template <class params>
__device__ void real_to_complex_compressed(int16_t *src, double2 *dst) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
dst[tid].x = __int2double_rn(src[2 * tid]);
dst[tid].y = __int2double_rn(src[2 * tid + 1]);
tid += params::degree / params::opt;
}
}
/*
* copy source polynomial to specific slice of batched polynomials
* used only in low latency version
*/
template <typename T, class params>
__device__ void copy_into_ith_polynomial_low_lat(T *source, T *dst, int i) {
int tid = threadIdx.x;
int begin = i * (params::degree / 2 + 1);
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
dst[tid + begin] = source[tid];
tid = tid + params::degree / params::opt;
}
if (threadIdx.x == 0) {
dst[params::degree / 2 + begin] = source[params::degree / 2];
}
}
template <typename T, int elems_per_thread, int block_size>
__device__ void copy_polynomial(T *source, T *dst) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < elems_per_thread; i++) {
dst[tid] = source[tid];
tid = tid + block_size;
}
}
/*
* accumulates source polynomial into specific slice of batched polynomial
* used only in low latency version
*/
template <typename T, class params>
__device__ void add_polynomial_inplace_low_lat(T *source, T *dst, int p_id) {
int tid = threadIdx.x;
int begin = p_id * (params::degree / 2 + 1);
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
dst[tid] += source[tid + begin];
tid = tid + params::degree / params::opt;
}
if (threadIdx.x == 0) {
dst[params::degree / 2] += source[params::degree / 2 + begin];
}
}
/*
* Receives num_poly concatenated polynomials of type T. For each:
*
 * Performs acc = input * X^{-j} (i.e. divides input by the monomial X^j in
 * the negacyclic ring) if zeroAcc = false
* Performs acc = 0 if zeroAcc
* takes single buffer and calculates inplace.
*
* By default, it works on a single polynomial.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void divide_by_monomial_negacyclic_inplace(T *accumulator, T *input,
uint32_t j, bool zeroAcc,
uint32_t num_poly = 1) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *accumulator_slice = (T *)accumulator + (ptrdiff_t)(z * degree);
T *input_slice = (T *)input + (ptrdiff_t)(z * degree);
int tid = threadIdx.x;
if (zeroAcc) {
for (int i = 0; i < elems_per_thread; i++) {
accumulator_slice[tid] = 0;
tid += block_size;
}
} else {
tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
// if (tid < degree - j)
// accumulator_slice[tid] = input_slice[tid + j];
// else
// accumulator_slice[tid] = -input_slice[tid - degree + j];
int x = tid + j - SEL(degree, 0, tid < degree - j);
accumulator_slice[tid] =
SEL(-1, 1, tid < degree - j) * input_slice[x];
} else {
int32_t jj = j - degree;
// if (tid < degree - jj)
// accumulator_slice[tid] = -input_slice[tid + jj];
// else
// accumulator_slice[tid] = input_slice[tid - degree + jj];
int x = tid + jj - SEL(degree, 0, tid < degree - jj);
accumulator_slice[tid] =
SEL(1, -1, tid < degree - jj) * input_slice[x];
}
tid += block_size;
}
}
}
}
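// Worked example (added for clarity, not in the original file): with
// degree = 4 and input = [a0, a1, a2, a3], dividing by X (j = 1) yields
// acc = [a1, a2, a3, -a0]: coefficients shift down by j and the ones that
// wrap around pick up a minus sign, because X^degree = -1 in the negacyclic
// ring Z[X]/(X^degree + 1). For j in [degree, 2*degree) the same rotation is
// applied with all signs flipped (the jj = j - degree branch above).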
/*
* Receives num_poly concatenated polynomials of type T. For each:
*
 * Performs result_acc = acc * X^j - acc (i.e. result_acc = acc * (X^j - 1))
* takes single buffer as input and returns a single rotated buffer
*
* By default, it works on a single polynomial.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void multiply_by_monomial_negacyclic_and_sub_polynomial(
T *acc, T *result_acc, uint32_t j, uint32_t num_poly = 1) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *acc_slice = (T *)acc + (ptrdiff_t)(z * degree);
T *result_acc_slice = (T *)result_acc + (ptrdiff_t)(z * degree);
int tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
if (j < degree) {
// if (tid < j)
// result_acc_slice[tid] = -acc_slice[tid - j + degree]-acc_slice[tid];
// else
// result_acc_slice[tid] = acc_slice[tid - j] - acc_slice[tid];
int x = tid - j + SEL(0, degree, tid < j);
result_acc_slice[tid] =
SEL(1, -1, tid < j) * acc_slice[x] - acc_slice[tid];
} else {
int32_t jj = j - degree;
// if (tid < jj)
// result_acc_slice[tid] = acc_slice[tid - jj + degree]-acc_slice[tid];
// else
// result_acc_slice[tid] = -acc_slice[tid - jj] - acc_slice[tid];
int x = tid - jj + SEL(0, degree, tid < jj);
result_acc_slice[tid] =
SEL(-1, 1, tid < jj) * acc_slice[x] - acc_slice[tid];
}
tid += block_size;
}
}
}
/*
* Receives num_poly concatenated polynomials of type T. For each performs a
* rounding to increase accuracy of the PBS. Calculates inplace.
*
* By default, it works on a single polynomial.
*/
template <typename T, int elems_per_thread, int block_size>
__device__ void round_to_closest_multiple_inplace(T *rotated_acc, int base_log,
int level_count,
uint32_t num_poly = 1) {
constexpr int degree = block_size * elems_per_thread;
for (int z = 0; z < num_poly; z++) {
T *rotated_acc_slice = (T *)rotated_acc + (ptrdiff_t)(z * degree);
int tid = threadIdx.x;
for (int i = 0; i < elems_per_thread; i++) {
T x_acc = rotated_acc_slice[tid];
T shift = sizeof(T) * 8 - level_count * base_log;
T mask = 1ll << (shift - 1);
T b_acc = (x_acc & mask) >> (shift - 1);
T res_acc = x_acc >> shift;
res_acc += b_acc;
res_acc <<= shift;
rotated_acc_slice[tid] = res_acc;
tid = tid + block_size;
}
}
}
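// Worked example (added for clarity, not in the original file): with
// T = uint64_t, base_log = 8 and level_count = 4, shift = 64 - 32 = 32.
// Each coefficient keeps its 32 most significant bits, rounded to the nearest
// multiple of 2^32: the bit just below the cut (mask = 2^31) is the rounding
// bit b, the value is shifted down, b is added, and the result is shifted
// back up. E.g. x = 0x0000000180000000 rounds up to 0x0000000200000000,
// while x = 0x000000017FFFFFFF rounds down to 0x0000000100000000.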
template <typename Torus, class params>
__device__ void add_to_torus(double2 *m_values, Torus *result,
bool init_torus = false) {
Torus mx = (sizeof(Torus) == 4) ? UINT32_MAX : UINT64_MAX;
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt / 2; i++) {
double v1 = m_values[tid].x;
double v2 = m_values[tid].y;
double frac = v1 - floor(v1);
frac *= mx;
double carry = frac - floor(frac);
frac += (carry >= 0.5);
Torus V1 = 0;
typecast_double_to_torus<Torus>(frac, V1);
frac = v2 - floor(v2);
frac *= mx;
    // take the fractional part of the scaled value, as in the v1 branch above
    carry = frac - floor(frac);
frac += (carry >= 0.5);
Torus V2 = 0;
typecast_double_to_torus<Torus>(frac, V2);
if (init_torus) {
result[tid] = V1;
result[tid + params::degree / 2] = V2;
} else {
result[tid] += V1;
result[tid + params::degree / 2] += V2;
}
tid = tid + params::degree / params::opt;
}
}
// Extracts the body of a GLWE.
// k is the offset to find the body element / polynomial in the lwe_array_out /
// accumulator
template <typename Torus, class params>
__device__ void sample_extract_body(Torus *lwe_array_out, Torus *accumulator,
uint32_t k) {
// Set first coefficient of the accumulator as the body of the LWE sample
lwe_array_out[k * params::degree] = accumulator[k * params::degree];
}
// Extracts the mask from num_poly polynomials individually
template <typename Torus, class params>
__device__ void sample_extract_mask(Torus *lwe_array_out, Torus *accumulator,
uint32_t num_poly = 1) {
for (int z = 0; z < num_poly; z++) {
Torus *lwe_array_out_slice =
(Torus *)lwe_array_out + (ptrdiff_t)(z * params::degree);
Torus *accumulator_slice =
(Torus *)accumulator + (ptrdiff_t)(z * params::degree);
// Set ACC = -ACC
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] = -accumulator_slice[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
// Reverse the accumulator
tid = threadIdx.x;
Torus result[params::opt];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
result[i] = accumulator_slice[params::degree - tid - 1];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] = result[i];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
// Perform ACC * X
// (equivalent to multiply_by_monomial_negacyclic_inplace(1))
tid = threadIdx.x;
    // Reuse the result[] registers declared above for the rotated values
for (int i = 0; i < params::opt; i++) {
// if (tid < 1)
// result[i] = -accumulator_slice[tid - 1 + params::degree];
// else
// result[i] = accumulator_slice[tid - 1];
int x = tid - 1 + SEL(0, params::degree, tid < 1);
result[i] = SEL(1, -1, tid < 1) * accumulator_slice[x];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
accumulator_slice[tid] = result[i];
tid += params::degree / params::opt;
}
synchronize_threads_in_block();
// Copy to the mask of the LWE sample
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
lwe_array_out_slice[tid] = accumulator_slice[tid];
tid = tid + params::degree / params::opt;
}
}
}
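// Worked example (added for clarity, not in the original file): extracting
// the constant coefficient from a single mask polynomial [a0, a1, a2, a3]
// (degree 4). Negation gives [-a0, -a1, -a2, -a3], reversal gives
// [-a3, -a2, -a1, -a0], and the final negacyclic multiplication by X gives
// [a0, -a3, -a2, -a1], which is the LWE mask expected from sample extraction
// at index 0 (mask_j = a_0 for j = 0 and -a_{N-j} otherwise).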
#endif

View File

@@ -1,106 +0,0 @@
#ifndef CNCRT_PARAMETERS_H
#define CNCRT_PARAMETERS_H
constexpr int log2(int n) { return (n <= 2) ? 1 : 1 + log2(n / 2); }
constexpr int choose_opt_amortized(int degree) {
if (degree <= 1024)
return 4;
else if (degree == 2048)
return 8;
else if (degree == 4096)
return 16;
else if (degree == 8192)
return 32;
else
return 64;
}
constexpr int choose_opt(int degree) {
if (degree <= 1024)
return 4;
else if (degree == 2048)
return 4;
else if (degree == 4096)
return 4;
else if (degree == 8192)
return 8;
else if (degree == 16384)
return 16;
else
return 64;
}
template <class params> class HalfDegree {
public:
constexpr static int degree = params::degree / 2;
constexpr static int opt = params::opt / 2;
constexpr static int log2_degree = params::log2_degree - 1;
constexpr static int quarter = params::quarter / 2;
constexpr static int half = params::half / 2;
constexpr static int three_quarters = quarter + half;
constexpr static int warp = 32;
constexpr static int fft_sm_required = degree + degree / warp;
};
template <int N> class Degree {
public:
constexpr static int degree = N;
constexpr static int opt = choose_opt(N);
constexpr static int log2_degree = log2(N);
constexpr static int quarter = N / 4;
constexpr static int half = N / 2;
constexpr static int three_quarters = half + quarter;
constexpr static int warp = 32;
constexpr static int fft_sm_required = N + 32;
};
template <int N> class AmortizedDegree {
public:
constexpr static int degree = N;
constexpr static int opt = choose_opt_amortized(N);
constexpr static int log2_degree = log2(N);
constexpr static int quarter = N / 4;
constexpr static int half = N / 2;
constexpr static int three_quarters = half + quarter;
constexpr static int warp = 32;
constexpr static int fft_sm_required = N + 32;
};
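// Illustrative compile-time check (not part of the original file): what the
// parameter classes expand to for the 2048 case used throughout this backend.
static_assert(Degree<2048>::opt == 4, "choose_opt(2048) == 4");
static_assert(Degree<2048>::log2_degree == 11, "log2(2048) == 11");
static_assert(Degree<2048>::half == 1024 && Degree<2048>::quarter == 512 &&
                  Degree<2048>::three_quarters == 1536,
              "fractional sizes of the 2048 case");
static_assert(AmortizedDegree<2048>::opt == 8,
              "the amortized variant processes more coefficients per thread");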
enum sharedMemDegree {
NOSM = 0,
PARTIALSM = 1,
FULLSM = 2
};
class ForwardFFT {
public:
constexpr static int direction = 0;
};
class BackwardFFT {
public:
constexpr static int direction = 1;
};
class ReorderFFT {
constexpr static int reorder = 1;
};
class NoReorderFFT {
constexpr static int reorder = 0;
};
template <class params, class direction, class reorder = ReorderFFT>
class FFTDegree : public params {
public:
constexpr static int fft_direction = direction::direction;
constexpr static int fft_reorder = reorder::reorder;
};
template <int N, class direction, class reorder = ReorderFFT>
class FFTParams : public Degree<N> {
public:
constexpr static int fft_direction = direction::direction;
constexpr static int fft_reorder = reorder::reorder;
};
#endif // CNCRT_PARAMETERS_H

View File

@@ -1,259 +0,0 @@
#ifndef CNCRT_POLYNOMIAL_H
#define CNCRT_POLYNOMIAL_H
#include "complex/operations.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "parameters.cuh"
#include "utils/timer.cuh"
#include <cassert>
#include <cstdint>
#define PI 3.141592653589793238462643383279502884197
template <typename T>
__device__ T *get_chunk(T *data, int chunk_num, int chunk_size) {
int pos = chunk_num * chunk_size;
T *ptr = &data[pos];
return ptr;
}
class ExtraMemory {
public:
uint32_t m_size;
__device__ ExtraMemory(uint32_t size) : m_size(size) {}
};
template <typename T, class params> class Polynomial;
template <typename T, class params> class Vector;
template <typename FT, class params> class Twiddles;
template <typename T, class params> class Polynomial {
public:
T *coefficients;
uint32_t degree;
__device__ Polynomial(T *coefficients, uint32_t degree)
: coefficients(coefficients), degree(degree) {}
__device__ Polynomial(int8_t *memory, uint32_t degree)
: coefficients((T *)memory), degree(degree) {}
__host__ void copy_to_host(T *dest) {
cudaMemcpyAsync(dest, this->coefficients, sizeof(T) * params::degree,
cudaMemcpyDeviceToHost);
}
__device__ T get_coefficient(int i) { return this->coefficients[i]; }
__device__ int8_t *reuse_memory() { return (int8_t *)coefficients; }
__device__ void copy_coefficients_from(Polynomial<T, params> &source,
int begin_dest = 0,
int begin_src = 0) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
this->coefficients[tid + begin_dest] = source.coefficients[tid];
tid = tid + params::degree / params::opt;
}
}
__device__ void fill_with(T value) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
coefficients[tid] = value;
tid += params::degree / params::opt;
}
}
__device__ void round_to_closest_multiple_inplace(uint32_t base_log,
uint32_t level_count) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
T x = coefficients[tid];
T shift = sizeof(T) * 8 - level_count * base_log;
T mask = 1ll << (shift - 1);
T b = (x & mask) >> (shift - 1);
T res = x >> shift;
res += b;
res <<= shift;
coefficients[tid] = res;
tid = tid + params::degree / params::opt;
}
}
__device__ void multiply_by_scalar_inplace(T scalar) {
int tid = threadIdx.x;
const int grid_dim = blockDim.x;
const int slices = params::degree / grid_dim;
const int jump = grid_dim;
for (int i = 0; i < slices; i++) {
this->coefficients[tid] *= scalar;
tid += jump;
}
}
__device__ void add_scalar_inplace(T scalar) {
int tid = threadIdx.x;
const int grid_dim = blockDim.x;
const int slices = params::degree / grid_dim;
const int jump = grid_dim;
for (int i = 0; i < slices; i++) {
this->coefficients[tid] += scalar;
tid += jump;
}
}
__device__ void sub_scalar_inplace(T scalar) {
int tid = threadIdx.x;
const int grid_dim = blockDim.x;
const int slices = params::degree / grid_dim;
const int jump = grid_dim;
for (int i = 0; i < slices; i++) {
this->coefficients[tid] -= scalar;
tid += jump;
}
}
__device__ void sub_polynomial_inplace(Polynomial<T, params> &rhs) {
int tid = threadIdx.x;
const int grid_dim = blockDim.x;
const int slices = params::degree / grid_dim;
const int jump = grid_dim;
for (int i = 0; i < slices; i++) {
this->coefficients[tid] -= rhs.coefficients[tid];
tid += jump;
}
}
__device__ void negate_inplace() {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
coefficients[tid] = -coefficients[tid];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
}
__device__ void copy_into(Vector<T, params> &vec) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
vec.m_data[tid] = coefficients[tid];
tid = tid + params::degree / params::opt;
}
}
__device__ void copy_reversed_into(Vector<T, params> &vec) {
int tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
vec.m_data[tid] = coefficients[params::degree - tid - 1];
tid = tid + params::degree / params::opt;
}
}
__device__ void reverse_inplace() {
int tid = threadIdx.x;
T result[params::opt];
#pragma unroll
for (int i = 0; i < params::opt; i++) {
result[i] = coefficients[params::degree - tid - 1];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
tid = threadIdx.x;
#pragma unroll
for (int i = 0; i < params::opt; i++) {
coefficients[tid] = result[i];
tid = tid + params::degree / params::opt;
}
synchronize_threads_in_block();
}
};
template <typename T, class params> class Vector {
public:
T *m_data;
uint32_t m_size;
__device__ Vector(T *elements, uint32_t size)
: m_data(elements), m_size(size) {}
__host__ Vector() {}
__device__ T &operator[](int i) { return m_data[i]; }
__device__ Vector<T, params> get_chunk(int chunk_num, int chunk_size) {
int pos = chunk_num * chunk_size;
T *ptr = &m_data[pos];
return Vector<T, params>(ptr, chunk_size);
}
__host__ void copy_to_device(T *source, uint32_t elements) {
cudaMemcpyAsync(m_data, source, sizeof(T) * elements,
cudaMemcpyHostToDevice);
}
__host__ void copy_to_host(T *dest) {
cudaMemcpyAsync(dest, m_data, sizeof(T) * m_size, cudaMemcpyDeviceToHost);
}
__host__ void copy_to_host(T *dest, int elements) {
cudaMemcpyAsync(dest, m_data, sizeof(T) * elements, cudaMemcpyDeviceToHost);
}
__device__ T get_ith_element(int i) { return m_data[i]; }
__device__ T get_last_element() { return m_data[m_size - 1]; }
__device__ void set_last_element(T elem) { m_data[m_size - 1] = elem; }
__device__ void operator-=(const Vector<T, params> &rhs) {
    assert(m_size == rhs.m_size);
int tid = threadIdx.x;
int pos = tid;
int total = m_size / blockDim.x + 1;
for (int i = 0; i < total; i++) {
if (pos < m_size)
m_data[pos] -= rhs.m_data[pos];
pos += blockDim.x;
}
}
__device__ void operator*=(const T &rhs) {
int tid = threadIdx.x;
int pos = tid;
int total = m_size / blockDim.x + 1;
for (int i = 0; i < total; i++) {
if (pos < m_size)
m_data[pos] *= rhs;
pos += blockDim.x;
}
}
};
template <typename FT, class params> class Twiddles {
public:
Vector<FT, params> twiddles2, twiddles3, twiddles4, twiddles5, twiddles6,
twiddles7, twiddles8, twiddles9, twiddles10;
__device__
Twiddles(Vector<FT, params> &twiddles2, Vector<FT, params> &twiddles3,
Vector<FT, params> &twiddles4, Vector<FT, params> &twiddles5,
Vector<FT, params> &twiddles6, Vector<FT, params> &twiddles7,
Vector<FT, params> &twiddles8, Vector<FT, params> &twiddles9,
Vector<FT, params> &twiddles10)
: twiddles2(twiddles2), twiddles3(twiddles3), twiddles4(twiddles4),
twiddles5(twiddles5), twiddles6(twiddles6), twiddles7(twiddles7),
twiddles8(twiddles8), twiddles9(twiddles9), twiddles10(twiddles10) {}
};
#endif // CNCRT_POLYNOMIAL_H

View File

@@ -1,76 +0,0 @@
#ifndef CNCRT_POLYNOMIAL_MATH_H
#define CNCRT_POLYNOMIAL_MATH_H
#include "crypto/torus.cuh"
#include "parameters.cuh"
#include "polynomial.cuh"
template <typename FT, class params>
__device__ void sub_polynomial(FT *result, FT *first, FT *second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
result[tid] = first[tid] - second[tid];
tid += params::degree / params::opt;
}
}
template <class params, typename T>
__device__ void polynomial_product_in_fourier_domain(T *result, T *first,
T *second) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
result[tid] = first[tid] * second[tid];
tid += params::degree / params::opt;
}
if (threadIdx.x == 0) {
result[params::degree / 2] =
first[params::degree / 2] * second[params::degree / 2];
}
}
// Computes result += first * second
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <class params, typename T>
__device__ void
polynomial_product_accumulate_in_fourier_domain(T *result, T *first, T *second,
bool init_accumulator = false) {
int tid = threadIdx.x;
for (int i = 0; i < params::opt / 2; i++) {
if (init_accumulator)
result[tid] = first[tid] * second[tid];
else
result[tid] += first[tid] * second[tid];
tid += params::degree / params::opt;
}
}
// If init_accumulator is set, assumes that result was not initialized and does
// that with the outcome of first * second
template <typename T, class params>
__device__ void
polynomial_product_accumulate_by_monomial(T *result, T *poly,
uint64_t monomial_degree,
bool init_accumulator = false) {
// monomial_degree \in [0, 2 * params::degree)
int full_cycles_count = monomial_degree / params::degree;
int remainder_degrees = monomial_degree % params::degree;
int pos = threadIdx.x;
for (int i = 0; i < params::opt; i++) {
T element = poly[pos];
int new_pos = (pos + monomial_degree) % params::degree;
T x = SEL(element, -element, full_cycles_count % 2); // monomial coefficient
x = SEL(-x, x, new_pos >= remainder_degrees);
if (init_accumulator)
result[new_pos] = x;
else
result[new_pos] += x;
pos += params::degree / params::opt;
}
}
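// Worked example (added for clarity, not in the original file): with
// params::degree = 4 and monomial_degree = 5, full_cycles_count = 1 and
// remainder_degrees = 1, so poly * X^5 = -(poly * X) in the negacyclic ring.
// Each coefficient at position pos moves to (pos + 5) % 4; it is negated once
// because the full X^4 cycle contributes a factor of -1, and negated a second
// time when the remaining rotation wraps it around (new_pos <
// remainder_degrees), e.g. [a0, a1, a2, a3] becomes [a3, -a0, -a1, -a2].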
#endif // CNCRT_POLYNOMIAL_MATH_H

View File

@@ -1 +0,0 @@
#include "polynomial.cuh"

View File

@@ -1,76 +0,0 @@
#ifndef CNCRT_INT128_H
#define CNCRT_INT128_H
// abseil's int128 type
// licensed under Apache license
class uint128 {
public:
__device__ uint128(uint64_t high, uint64_t low) : hi_(high), lo_(low) {}
uint64_t lo_;
uint64_t hi_;
};
class int128 {
public:
int128() = default;
__device__ operator unsigned long long() const {
return static_cast<unsigned long long>(lo_);
}
__device__ int128(int64_t high, uint64_t low) : hi_(high), lo_(low) {}
uint64_t lo_;
int64_t hi_;
};
__device__ inline uint128 make_uint128(uint64_t high, uint64_t low) {
return uint128(high, low);
}
template <typename T> __device__ uint128 make_uint128_from_float(T v) {
if (v >= ldexp(static_cast<T>(1), 64)) {
uint64_t hi = static_cast<uint64_t>(ldexp(v, -64));
uint64_t lo = static_cast<uint64_t>(v - ldexp(static_cast<T>(hi), 64));
return make_uint128(hi, lo);
}
return make_uint128(0, static_cast<uint64_t>(v));
}
__device__ inline int128 make_int128(int64_t high, uint64_t low) {
return int128(high, low);
}
__device__ inline int64_t bitcast_to_signed(uint64_t v) {
return v & (uint64_t{1} << 63) ? ~static_cast<int64_t>(~v)
: static_cast<int64_t>(v);
}
__device__ inline uint64_t uint128_high64(uint128 v) { return v.hi_; }
__device__ inline uint64_t uint128_low64(uint128 v) { return v.lo_; }
__device__ __forceinline__ uint128 operator-(uint128 val) {
uint64_t hi = ~uint128_high64(val);
uint64_t lo = ~uint128_low64(val) + 1;
if (lo == 0)
++hi; // carry
return make_uint128(hi, lo);
}
template <typename T> __device__ int128 make_int128_from_float(T v) {
// We must convert the absolute value and then negate as needed, because
// floating point types are typically sign-magnitude. Otherwise, the
// difference between the high and low 64 bits when interpreted as two's
// complement overwhelms the precision of the mantissa.
uint128 result =
v < 0 ? -make_uint128_from_float(-v) : make_uint128_from_float(v);
return make_int128(bitcast_to_signed(uint128_high64(result)),
uint128_low64(result));
}
#endif

View File

@@ -1,21 +0,0 @@
#ifndef KERNEL_DIMENSIONS_H
#define KERNEL_DIMENSIONS_H
inline int nextPow2(int x) {
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
inline void getNumBlocksAndThreads(const int n, const int maxBlockSize,
int &blocks, int &threads) {
threads =
(n < maxBlockSize * 2) ? max(128, nextPow2((n + 1) / 2)) : maxBlockSize;
blocks = (n + threads - 1) / threads;
}
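// Worked example (added for clarity, not in the original file): for
// n = 3000 entries and maxBlockSize = 512, n >= 2 * maxBlockSize, so
// threads = 512 and blocks = ceil(3000 / 512) = 6. For a small n = 100,
// nextPow2((100 + 1) / 2) = nextPow2(50) = 64, threads = max(128, 64) = 128
// and blocks = 1.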
#endif // KERNEL_DIMENSIONS_H

View File

@@ -1,30 +0,0 @@
#ifndef CNCRT_TIMER_H
#define CNCRT_TIMER_H
#include <iostream>
#define synchronize_threads_in_block() __syncthreads()
template <bool active> class CudaMeasureExecution {
public:
cudaEvent_t m_start, m_stop;
__host__ CudaMeasureExecution() {
if constexpr (active) {
cudaEventCreate(&m_start);
cudaEventCreate(&m_stop);
cudaEventRecord(m_start);
}
}
__host__ ~CudaMeasureExecution() {
if constexpr (active) {
float ms;
cudaEventRecord(m_stop);
cudaEventSynchronize(m_stop);
cudaEventElapsedTime(&ms, m_start, m_stop);
std::cout << "Execution took " << ms << "ms" << std::endl;
}
}
};
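// Illustrative usage sketch (not part of the original file): the class is an
// RAII guard, so wrapping a region in a scope prints its execution time when
// the guard is destroyed, and instantiating it with <false> compiles the
// measurement away. `some_kernel`, `grid` and `block` are placeholders.
//
//   {
//     CudaMeasureExecution<true> measure; // records the start event
//     some_kernel<<<grid, block>>>(/* args */);
//   } // records the stop event, synchronizes on it and prints the elapsed ms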
#endif // CNCRT_TIMER_H

View File

@@ -1,482 +0,0 @@
#include "vertical_packing.cuh"
#include "vertical_packing.h"
#include <cassert>
/*
* Runs standard checks to validate the inputs
*/
void checks_fast_cmux_tree(int polynomial_size) {
assert((
"Error (GPU Cmux tree): polynomial size should be one of 256, 512, 1024, "
"2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
}
/*
* Runs standard checks to validate the inputs
*/
void checks_cmux_tree(int nbits, int polynomial_size, int base_log) {
assert(("Error (GPU Cmux tree): base log should be <= nbits",
base_log <= nbits));
checks_fast_cmux_tree(polynomial_size);
}
/*
* Runs standard checks to validate the inputs
*/
void checks_blind_rotation_and_sample_extraction(int polynomial_size) {
assert(("Error (GPU Blind rotation + sample extraction): polynomial size "
"should be one of 256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the Cmux tree on 32 bits inputs, into `cmux_tree_buffer`. It also configures
* SM options on the GPU in case FULLSM mode is going to be used.
*/
void scratch_cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t lut_vector_size, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_cmux_tree(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_cmux_tree<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_cmux_tree<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 1024:
scratch_cmux_tree<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 2048:
scratch_cmux_tree<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 4096:
scratch_cmux_tree<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 8192:
scratch_cmux_tree<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
default:
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the Cmux tree on 64 bits inputs, into `cmux_tree_buffer`. It also configures
* SM options on the GPU in case FULLSM mode is going to be used.
*/
void scratch_cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t level_count,
uint32_t lut_vector_size, uint32_t tau,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_cmux_tree(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_cmux_tree<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 512:
scratch_cmux_tree<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 1024:
scratch_cmux_tree<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 2048:
scratch_cmux_tree<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 4096:
scratch_cmux_tree<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
case 8192:
scratch_cmux_tree<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, cmux_tree_buffer, glwe_dimension, polynomial_size,
level_count, lut_vector_size, tau, max_shared_memory,
allocate_gpu_memory);
break;
default:
break;
}
}
/*
* Perform cmux tree on a batch of 32-bit input GGSW ciphertexts.
* Check the equivalent function for 64-bit inputs for more details.
*/
void cuda_cmux_tree_32(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector,
int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t lut_vector_size,
uint32_t tau, uint32_t max_shared_memory) {
checks_cmux_tree(32, polynomial_size, base_log);
switch (polynomial_size) {
case 256:
host_cmux_tree<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 512:
host_cmux_tree<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 1024:
host_cmux_tree<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 2048:
host_cmux_tree<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 4096:
host_cmux_tree<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 8192:
host_cmux_tree<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, (uint32_t *)glwe_array_out, (uint32_t *)ggsw_in,
(uint32_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
default:
break;
}
}
/*
* Perform Cmux tree on a batch of 64-bit input GGSW ciphertexts
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
 * - 'glwe_array_out' output batch of GLWE buffers for the Cmux tree; 'tau'
 * GLWE ciphertexts will be output by the function
* - 'ggsw_in' batch of input GGSW ciphertexts, function expects 'r' GGSW
* ciphertexts as input.
 * - 'lut_vector' batch of test vectors (LUTs); there should be 2^r LUTs
 * inside the 'lut_vector' parameter
* - 'glwe_dimension' GLWE dimension, supported values: {1}
* - 'polynomial_size' size of the test polynomial, supported values: {512,
* 1024, 2048, 4096, 8192}
* - 'base_log' base log parameter for cmux block
* - 'level_count' decomposition level for cmux block
* - 'lut_vector_size' number of elements in lut_vector
 * - 'tau' number of input LWE ciphertexts that were used to generate the GGSW
 * ciphertexts stored in 'ggsw_in'; it is also the number of output GLWE
 * ciphertexts
* - 'max_shared_memory' maximum shared memory amount to be used for cmux
* kernel
*
* This function calls a wrapper to a device kernel that performs the
* Cmux tree. The kernel is templatized based on integer discretization and
* polynomial degree.
*/
void cuda_cmux_tree_64(void *v_stream, uint32_t gpu_index, void *glwe_array_out,
void *ggsw_in, void *lut_vector,
int8_t *cmux_tree_buffer, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t lut_vector_size,
uint32_t tau, uint32_t max_shared_memory) {
checks_cmux_tree(64, polynomial_size, base_log);
switch (polynomial_size) {
case 256:
host_cmux_tree<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 512:
host_cmux_tree<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 1024:
host_cmux_tree<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 2048:
host_cmux_tree<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 4096:
host_cmux_tree<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
case 8192:
host_cmux_tree<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)glwe_array_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log, level_count, lut_vector_size, tau,
max_shared_memory);
break;
default:
break;
}
}
/*
* This cleanup function frees the data for the Cmux tree on GPU in
* cmux_tree_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_cmux_tree(void *v_stream, uint32_t gpu_index,
int8_t **cmux_tree_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*cmux_tree_buffer, stream, gpu_index);
}
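// Illustrative usage sketch (not part of the original file): the expected
// call sequence for the 64-bit Cmux tree defined above. All device buffers
// and parameter values are placeholders supplied by the caller.
void example_cmux_tree_64(void *d_glwe_out, void *d_ggsw_in, void *d_lut,
                          uint32_t glwe_dimension, uint32_t polynomial_size,
                          uint32_t base_log, uint32_t level_count,
                          uint32_t lut_vector_size, uint32_t tau,
                          uint32_t max_shared_memory) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  int8_t *cmux_tree_buffer = nullptr;
  scratch_cuda_cmux_tree_64(&stream, /*gpu_index=*/0, &cmux_tree_buffer,
                            glwe_dimension, polynomial_size, level_count,
                            lut_vector_size, tau, max_shared_memory,
                            /*allocate_gpu_memory=*/true);
  cuda_cmux_tree_64(&stream, /*gpu_index=*/0, d_glwe_out, d_ggsw_in, d_lut,
                    cmux_tree_buffer, glwe_dimension, polynomial_size,
                    base_log, level_count, lut_vector_size, tau,
                    max_shared_memory);
  cleanup_cuda_cmux_tree(&stream, /*gpu_index=*/0, &cmux_tree_buffer);
  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
}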
/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the blind rotation and sample extraction on 32 bits inputs, into
 * `br_se_buffer`. It also configures SM options on the GPU in case FULLSM
 * mode is going to be used.
*/
void scratch_cuda_blind_rotation_sample_extraction_32(
void *v_stream, uint32_t gpu_index, int8_t **br_se_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_blind_rotation_and_sample_extraction(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_blind_rotation_sample_extraction<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
 * This scratch function allocates the necessary amount of data on the GPU for
 * the blind rotation and sample extraction on 64 bits inputs, into
 * `br_se_buffer`. It also configures SM options on the GPU in case FULLSM
 * mode is going to be used.
*/
void scratch_cuda_blind_rotation_sample_extraction_64(
void *v_stream, uint32_t gpu_index, int8_t **br_se_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_blind_rotation_and_sample_extraction(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_blind_rotation_sample_extraction<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, br_se_buffer, glwe_dimension, polynomial_size,
level_count, mbr_size, tau, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* Performs blind rotation on batch of 64-bit input ggsw ciphertexts
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - 'lwe_out' batch of output lwe ciphertexts, there should be 'tau'
* ciphertexts inside 'lwe_out'
* - 'ggsw_in' batch of input ggsw ciphertexts, function expects 'mbr_size'
* ggsw ciphertexts inside 'ggsw_in'
* - 'lut_vector' list of test vectors, function expects 'tau' test vectors
* inside 'lut_vector' parameter
* - 'glwe_dimension' glwe dimension, supported values : {1}
* - 'polynomial_size' size of test polynomial supported sizes: {512, 1024,
* 2048, 4096, 8192}
* - 'base_log' base log parameter
* - 'l_gadget' decomposition level
 * - 'max_shared_memory' maximum amount of shared memory to be used in
 * device functions (kernels)
*
* This function calls a wrapper to a device kernel that performs the
* blind rotation and sample extraction. The kernel is templatized based on
* integer discretization and polynomial degree.
*/
void cuda_blind_rotate_and_sample_extraction_64(
void *v_stream, uint32_t gpu_index, void *lwe_out, void *ggsw_in,
void *lut_vector, int8_t *br_se_buffer, uint32_t mbr_size, uint32_t tau,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t l_gadget, uint32_t max_shared_memory) {
checks_blind_rotation_and_sample_extraction(polynomial_size);
switch (polynomial_size) {
case 256:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
polynomial_size, base_log, l_gadget, max_shared_memory);
break;
case 512:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
polynomial_size, base_log, l_gadget, max_shared_memory);
break;
case 1024:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
polynomial_size, base_log, l_gadget, max_shared_memory);
break;
case 2048:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
polynomial_size, base_log, l_gadget, max_shared_memory);
break;
case 4096:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
polynomial_size, base_log, l_gadget, max_shared_memory);
break;
case 8192:
host_blind_rotate_and_sample_extraction<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_out, (uint64_t *)ggsw_in,
(uint64_t *)lut_vector, br_se_buffer, mbr_size, tau, glwe_dimension,
polynomial_size, base_log, l_gadget, max_shared_memory);
break;
}
}
/*
* This cleanup function frees the data for the blind rotation and sample
* extraction on GPU in br_se_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_blind_rotation_sample_extraction(void *v_stream,
uint32_t gpu_index,
int8_t **br_se_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*br_se_buffer, stream, gpu_index);
}

View File

@@ -1,615 +0,0 @@
#ifndef VERTICAL_PACKING_CUH
#define VERTICAL_PACKING_CUH
#include "bootstrap.h"
#include "complex/operations.cuh"
#include "crypto/gadget.cuh"
#include "crypto/ggsw.cuh"
#include "crypto/torus.cuh"
#include "device.h"
#include "fft/bnsmfft.cuh"
#include "fft/twiddles.cuh"
#include "polynomial/functions.cuh"
#include "polynomial/parameters.cuh"
#include "polynomial/polynomial.cuh"
#include "polynomial/polynomial_math.cuh"
#include "utils/timer.cuh"
/*
* Receives an array of GLWE ciphertexts and two indexes to ciphertexts in this
 * array, and an array of GGSW ciphertexts with an index to one ciphertext in
 * it. Computes a CMUX with these operands and writes the output to a particular
* index of glwe_array_out.
*
* This function needs polynomial_size threads per block.
*
* - glwe_array_out: An array where the result should be written to.
* - glwe_array_in: An array where the GLWE inputs are stored.
* - ggsw_in: An array where the GGSW input is stored. In the fourier domain.
* - selected_memory: An array to be used for the accumulators. Can be in the
* shared memory or global memory.
* - output_idx: The index of the output where the glwe ciphertext should be
* written.
* - input_idx1: The index of the first glwe ciphertext we will use.
* - input_idx2: The index of the second glwe ciphertext we will use.
* - glwe_dim: This is k.
* - polynomial_size: size of the polynomials. This is N.
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - ggsw_idx: The index of the GGSW we will use.
*/
template <typename Torus, typename STorus, class params>
__device__ void
cmux(Torus *glwe_array_out, Torus *glwe_array_in, double2 *ggsw_in,
int8_t *selected_memory, uint32_t output_idx, uint32_t input_idx1,
uint32_t input_idx2, uint32_t glwe_dim, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t ggsw_idx) {
// Define glwe_sub
Torus *glwe_sub = (Torus *)selected_memory;
double2 *res_fft =
(double2 *)glwe_sub +
(glwe_dim + 1) * polynomial_size / (sizeof(double2) / sizeof(Torus));
double2 *glwe_fft =
(double2 *)res_fft + (ptrdiff_t)((glwe_dim + 1) * polynomial_size / 2);
/////////////////////////////////////
// glwe2-glwe1
// Gets the pointers for the global memory
auto m0 = &glwe_array_in[input_idx1 * (glwe_dim + 1) * polynomial_size];
auto m1 = &glwe_array_in[input_idx2 * (glwe_dim + 1) * polynomial_size];
// Subtraction: m1-m0
for (int i = 0; i < (glwe_dim + 1); i++) {
auto glwe_sub_slice = glwe_sub + i * params::degree;
auto m0_slice = m0 + i * params::degree;
auto m1_slice = m1 + i * params::degree;
sub_polynomial<Torus, params>(glwe_sub_slice, m1_slice, m0_slice);
}
// Initialize the polynomial multiplication via FFT arrays
// The polynomial multiplications happens at the block level
// and each thread handles two or more coefficients
int pos = threadIdx.x;
for (int i = 0; i < (glwe_dim + 1); i++)
for (int j = 0; j < params::opt / 2; j++) {
res_fft[pos].x = 0;
res_fft[pos].y = 0;
pos += params::degree / params::opt;
}
synchronize_threads_in_block();
GadgetMatrix<Torus, params> gadget(base_log, level_count, glwe_sub,
glwe_dim + 1);
// Subtract each glwe operand, decompose the resulting
// polynomial coefficients to multiply each decomposed level
// with the corresponding part of the LUT
for (int level = level_count - 1; level >= 0; level--) {
// Decomposition
for (int i = 0; i < (glwe_dim + 1); i++) {
gadget.decompose_and_compress_next_polynomial(glwe_fft, i);
// First, perform the polynomial multiplication
NSMFFT_direct<HalfDegree<params>>(glwe_fft);
// External product and accumulate
// Get the piece necessary for the multiplication
auto bsk_slice = get_ith_mask_kth_block(
ggsw_in, ggsw_idx, i, level, polynomial_size, glwe_dim, level_count);
// Perform the coefficient-wise product
for (int j = 0; j < (glwe_dim + 1); j++) {
auto bsk_poly = bsk_slice + j * params::degree / 2;
auto res_fft_poly = res_fft + j * params::degree / 2;
polynomial_product_accumulate_in_fourier_domain<params, double2>(
res_fft_poly, glwe_fft, bsk_poly);
}
}
synchronize_threads_in_block();
}
// IFFT
synchronize_threads_in_block();
for (int i = 0; i < (glwe_dim + 1); i++) {
auto res_fft_slice = res_fft + i * params::degree / 2;
NSMFFT_inverse<HalfDegree<params>>(res_fft_slice);
}
synchronize_threads_in_block();
// Write the output
Torus *mb = &glwe_array_out[output_idx * (glwe_dim + 1) * polynomial_size];
int tid = threadIdx.x;
for (int i = 0; i < (glwe_dim + 1); i++)
for (int j = 0; j < params::opt; j++) {
mb[tid] = m0[tid];
tid += params::degree / params::opt;
}
for (int i = 0; i < (glwe_dim + 1); i++) {
auto res_fft_slice = res_fft + i * params::degree / 2;
auto mb_slice = mb + i * params::degree;
add_to_torus<Torus, params>(res_fft_slice, mb_slice);
}
}
// Converts an array of plaintexts to trivially encrypted GLWEs.
template <typename Torus, class params>
__host__ void
plaintext_to_glwe_array(Torus *lut_out, Torus *lut_in, uint32_t glwe_dimension,
uint32_t lut_vector_size, uint32_t number_of_trees,
cudaStream_t *stream) {
int r = log2(lut_vector_size) - params::log2_degree;
/*
* r < 0: No CMUX tree is needed, but the LUT is not big enough (i.e. has less
* than N elements).
*
* r == 0: No CMUX tree is needed and the LUT has exactly N
* elements.
*
* r > 0: CMUX tree is needed, so LUT is split in smaller LUTs of
* size lut_vector_size / num_lut.
*
* if r <= 0 we simply copy the LUT to lut_out, adding zeroes to the highest
* positions if needed.
*/
int num_lut = std::max(1, 1 << r);
check_cuda_error(cudaMemsetAsync(lut_out, 0,
num_lut * number_of_trees *
(glwe_dimension + 1) * params::degree *
sizeof(Torus),
*stream));
uint32_t small_lut_size = lut_vector_size / num_lut;
for (uint32_t i = 0; i < number_of_trees * num_lut; i++)
check_cuda_error(cudaMemcpyAsync(
lut_out + ((glwe_dimension + 1) * i + glwe_dimension) * params::degree,
lut_in + i * small_lut_size, small_lut_size * sizeof(Torus),
cudaMemcpyDeviceToDevice, *stream));
}
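// Worked example (added for clarity, not in the original file): with
// params::degree = 2048 (log2_degree = 11) and lut_vector_size = 8192,
// r = 13 - 11 = 2, so num_lut = 4 and each tree consumes 4 small LUTs of
// small_lut_size = 8192 / 4 = 2048 entries, each copied into the body
// polynomial of a trivial GLWE. With lut_vector_size <= 2048 (r <= 0),
// num_lut = 1 and the (possibly zero-padded) LUT is copied as a single
// trivial GLWE per tree.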
/**
* Computes several CMUXes using an array of GLWE ciphertexts and a single GGSW
* ciphertext. The GLWE ciphertexts are picked two-by-two in sequence. Each
* thread block computes a single CMUX.
*
* - glwe_array_out: An array where the result should be written to.
* - glwe_array_in: An array where the GLWE inputs are stored.
* - ggsw_in: An array where the GGSW input is stored. In the fourier domain.
 * - device_mem: A pointer to global memory, used in case the shared memory is
 * not big enough to store the accumulators.
* - device_memory_size_per_block: Memory size needed to store all accumulators
* for a single block.
* - glwe_dim: This is k.
* - polynomial_size: size of the polynomials. This is N.
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - ggsw_idx: The index of the GGSW we will use.
*/
template <typename Torus, typename STorus, class params, sharedMemDegree SMD>
__global__ void device_batch_cmux(Torus *glwe_array_out, Torus *glwe_array_in,
double2 *ggsw_in, int8_t *device_mem,
size_t device_memory_size_per_block,
uint32_t glwe_dim, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t ggsw_idx, uint32_t num_lut) {
// We are running gridDim.y cmux trees in parallel
int tree_idx = blockIdx.y;
int tree_offset = tree_idx * num_lut * (glwe_dim + 1) * polynomial_size;
auto block_glwe_array_out = glwe_array_out + tree_offset;
auto block_glwe_array_in = glwe_array_in + tree_offset;
// The x-axis handles a single cmux tree. Each block computes one cmux.
int cmux_idx = blockIdx.x;
int output_idx = cmux_idx;
int input_idx1 = (cmux_idx << 1);
int input_idx2 = (cmux_idx << 1) + 1;
// We use shared memory for intermediate result
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
if constexpr (SMD == FULLSM)
selected_memory = sharedmem;
else
selected_memory = &device_mem[(blockIdx.x + blockIdx.y * gridDim.x) *
device_memory_size_per_block];
cmux<Torus, STorus, params>(block_glwe_array_out, block_glwe_array_in,
ggsw_in, selected_memory, output_idx, input_idx1,
input_idx2, glwe_dim, polynomial_size, base_log,
level_count, ggsw_idx);
}
template <typename Torus>
__host__ __device__ uint64_t get_memory_needed_per_block_cmux_tree(
uint32_t glwe_dimension, uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size * (glwe_dimension + 1) + // glwe_sub
sizeof(double2) * polynomial_size / 2 *
(glwe_dimension + 1) + // res_fft
sizeof(double2) * polynomial_size / 2; // glwe_fft
}
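// Worked example (added for clarity, not in the original file): for
// Torus = uint64_t, glwe_dimension = 1 and polynomial_size = 2048,
// glwe_sub needs 8 * 2048 * 2 = 32768 bytes, res_fft needs
// 16 * 1024 * 2 = 32768 bytes and glwe_fft needs 16 * 1024 = 16384 bytes,
// i.e. 80 KB per block, which is why the FULLSM path raises the dynamic
// shared memory limit with cudaFuncSetAttribute in the scratch function below.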
template <typename Torus, typename params>
__host__ __device__ uint64_t get_buffer_size_cmux_tree(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t lut_vector_size, uint32_t tau, uint32_t max_shared_memory) {
int r = log2(lut_vector_size) - params::log2_degree;
if (r <= 0)
// A cmux tree is not needed
return 0;
uint64_t memory_needed_per_block =
get_memory_needed_per_block_cmux_tree<Torus>(glwe_dimension,
polynomial_size);
uint64_t num_lut = 1 << r;
uint64_t ggsw_size = polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
uint64_t glwe_size = (glwe_dimension + 1) * polynomial_size;
uint64_t device_mem = 0;
if (max_shared_memory < memory_needed_per_block) {
device_mem = memory_needed_per_block * (1 << (r - 1)) * tau;
}
if (max_shared_memory < polynomial_size * sizeof(double)) {
device_mem += polynomial_size * sizeof(double);
}
uint64_t buffer_size =
r * ggsw_size * sizeof(double) + // d_ggsw_fft_in
num_lut * tau * glwe_size * sizeof(Torus) + // d_buffer1
num_lut * tau * glwe_size * sizeof(Torus) + // d_buffer2
device_mem; // d_mem
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename STorus, typename params>
__host__ void
scratch_cmux_tree(void *v_stream, uint32_t gpu_index, int8_t **cmux_tree_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count, uint32_t lut_vector_size, uint32_t tau,
uint32_t max_shared_memory, bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t memory_needed_per_block =
get_memory_needed_per_block_cmux_tree<Torus>(glwe_dimension,
polynomial_size);
if (max_shared_memory >= memory_needed_per_block) {
check_cuda_error(cudaFuncSetAttribute(
device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block));
check_cuda_error(
cudaFuncSetCacheConfig(device_batch_cmux<Torus, STorus, params, FULLSM>,
cudaFuncCachePreferShared));
}
if (allocate_gpu_memory) {
uint64_t buffer_size = get_buffer_size_cmux_tree<Torus, params>(
glwe_dimension, polynomial_size, level_count, lut_vector_size, tau,
max_shared_memory);
*cmux_tree_buffer =
(int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
}
/*
 * This function launches the CMUX tree used by the hybrid packing of the
 * WoPBS.
 *
 * Uses shared memory for intermediate results when enough is available, and
 * global device memory otherwise.
*
* - v_stream: The CUDA stream that should be used.
* - glwe_array_out: A device array for the output GLWE ciphertext.
* - ggsw_in: A device array for the GGSW ciphertexts used in each layer.
* - lut_vector: A device array of cleartexts.
* - polynomial_size: size of the polynomials. This is N.
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - lut_vector_size: Number of elements in lut_vector
* - tau: The quantity of CMUX trees that should be executed
*/
template <typename Torus, typename STorus, class params>
__host__ void host_cmux_tree(void *v_stream, uint32_t gpu_index,
Torus *glwe_array_out, Torus *ggsw_in,
Torus *lut_vector, int8_t *cmux_tree_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t lut_vector_size, uint32_t tau,
uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
if (lut_vector_size <= params::degree) {
// The LUT itself is the result
plaintext_to_glwe_array<Torus, params>(glwe_array_out, lut_vector,
glwe_dimension, lut_vector_size, tau,
stream);
return;
}
// r = tau * p - log2(N)
uint32_t r = log2(lut_vector_size) - params::log2_degree;
uint32_t num_lut = 1 << r;
uint64_t memory_needed_per_block =
get_memory_needed_per_block_cmux_tree<Torus>(glwe_dimension,
polynomial_size);
dim3 thds(polynomial_size / params::opt, 1, 1);
//////////////////////
int ggsw_size = polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
int glwe_size = (glwe_dimension + 1) * polynomial_size;
// Define the buffers
  // Always define the buffers with the strongest memory alignment constraints
  // first; d_buffer1 and d_buffer2 are only aligned on Torus, so they're
  // defined last
double2 *d_ggsw_fft_in = (double2 *)cmux_tree_buffer;
int8_t *d_mem =
(int8_t *)d_ggsw_fft_in + (ptrdiff_t)(r * ggsw_size * sizeof(double));
int8_t *d_mem_fft = d_mem;
if (max_shared_memory < memory_needed_per_block) {
d_mem_fft =
d_mem + (ptrdiff_t)(memory_needed_per_block * num_lut / 2 * tau);
}
int8_t *d_buffer1 = d_mem_fft;
if (max_shared_memory < polynomial_size * sizeof(double)) {
d_buffer1 = d_mem_fft + (ptrdiff_t)(polynomial_size * sizeof(double));
}
int8_t *d_buffer2 =
d_buffer1 + (ptrdiff_t)(num_lut * tau * glwe_size * sizeof(Torus));
//////////////////////
batch_fft_ggsw_vector<Torus, STorus, params>(
stream, d_ggsw_fft_in, ggsw_in, d_mem_fft, r, glwe_dimension,
polynomial_size, level_count, gpu_index, max_shared_memory);
plaintext_to_glwe_array<Torus, params>((Torus *)d_buffer1, lut_vector,
glwe_dimension, lut_vector_size, tau,
stream);
Torus *output;
// Run the cmux tree
for (int layer_idx = 0; layer_idx < r; layer_idx++) {
output = (layer_idx % 2 ? (Torus *)d_buffer1 : (Torus *)d_buffer2);
Torus *input = (layer_idx % 2 ? (Torus *)d_buffer2 : (Torus *)d_buffer1);
int num_cmuxes = (1 << (r - 1 - layer_idx));
dim3 grid(num_cmuxes, tau, 1);
// walks horizontally through the leaves
if (max_shared_memory < memory_needed_per_block) {
device_batch_cmux<Torus, STorus, params, NOSM>
<<<grid, thds, 0, *stream>>>(output, input, d_ggsw_fft_in, d_mem,
memory_needed_per_block,
glwe_dimension, // k
polynomial_size, base_log, level_count,
layer_idx, // r
num_lut);
} else {
device_batch_cmux<Torus, STorus, params, FULLSM>
<<<grid, thds, memory_needed_per_block, *stream>>>(
output, input, d_ggsw_fft_in, d_mem, memory_needed_per_block,
glwe_dimension, // k
polynomial_size, base_log, level_count,
layer_idx, // r
num_lut);
}
check_cuda_error(cudaGetLastError());
}
for (int i = 0; i < tau; i++) {
check_cuda_error(cudaMemcpyAsync(
glwe_array_out + i * glwe_size, output + i * num_lut * glwe_size,
glwe_size * sizeof(Torus), cudaMemcpyDeviceToDevice, *stream));
}
}
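/*
 * Worked example (illustrative numbers, added for clarity): with
 * lut_vector_size = 4096 and polynomial_size N = 1024 (params::log2_degree =
 * 10), r = log2(4096) - 10 = 2 and num_lut = 2^r = 4, so each of the tau
 * trees starts from 4 GLWE leaves stored in d_buffer1. Layer 0 launches
 * 2^(r-1-0) = 2 CMUXes per tree (d_buffer1 -> d_buffer2), layer 1 launches a
 * single CMUX per tree (d_buffer2 -> d_buffer1), and the resulting GLWE of
 * each tree is finally copied from d_buffer1 to glwe_array_out.
 */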
/*
* Receives "tau" GLWE ciphertexts as LUTs and "mbr_size" GGSWs. Each block
* computes the blind rotation loop + sample extraction for a single LUT.
* Writes the lwe output to lwe_out.
*
* This function needs polynomial_size/params::opt threads per block and tau
* blocks
*
* - lwe_out: An array of lwe ciphertexts. The outcome is written here.
* - glwe_in: An array of "tau" GLWE ciphertexts. These are the LUTs.
* - ggsw_in: An array of "mbr_size" GGSWs in the fourier domain.
* - mbr_size: The number of GGSWs.
* - glwe_dim: This is k.
* - polynomial_size: size of the polynomials. This is N.
* - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
* - level_count: number of decomposition levels in the gadget matrix (~4)
* - device_memory_size_per_sample: Amount of (shared/global) memory used for
* the accumulators.
* - device_mem: An array to be used for the accumulators. Can be in the shared
* memory or global memory.
*/
template <typename Torus, typename STorus, class params, sharedMemDegree SMD>
__global__ void device_blind_rotation_and_sample_extraction(
Torus *lwe_out, Torus *glwe_in, double2 *ggsw_in, // m^BR
uint32_t mbr_size, uint32_t glwe_dim, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
size_t device_memory_size_per_sample, int8_t *device_mem) {
  // We use shared or global memory for the intermediate results, depending on SMD
extern __shared__ int8_t sharedmem[];
int8_t *selected_memory;
if constexpr (SMD == FULLSM)
selected_memory = sharedmem;
else
selected_memory = &device_mem[blockIdx.x * device_memory_size_per_sample];
Torus *accumulator_c0 = (Torus *)selected_memory;
Torus *accumulator_c1 =
(Torus *)accumulator_c0 + (glwe_dim + 1) * polynomial_size;
int8_t *cmux_memory =
(int8_t *)(accumulator_c1 + (glwe_dim + 1) * polynomial_size);
// Input LUT
auto mi = &glwe_in[blockIdx.x * (glwe_dim + 1) * polynomial_size];
int tid = threadIdx.x;
for (int i = 0; i < (glwe_dim + 1); i++)
for (int j = 0; j < params::opt; j++) {
accumulator_c0[tid] = mi[tid];
tid += params::degree / params::opt;
}
int monomial_degree = 0;
for (int i = mbr_size - 1; i >= 0; i--) {
synchronize_threads_in_block();
// Compute x^ai * ACC
// Mask and Body
divide_by_monomial_negacyclic_inplace<Torus, params::opt,
params::degree / params::opt>(
accumulator_c1, accumulator_c0, (1 << monomial_degree), false,
(glwe_dim + 1));
monomial_degree += 1;
// ACC = CMUX ( Ci, x^ai * ACC, ACC )
synchronize_threads_in_block();
cmux<Torus, STorus, params>(accumulator_c0, accumulator_c0, ggsw_in,
cmux_memory, 0, 0, 1, glwe_dim, polynomial_size,
base_log, level_count, i);
}
synchronize_threads_in_block();
// Write the output
auto block_lwe_out = &lwe_out[blockIdx.x * (glwe_dim * polynomial_size + 1)];
// The blind rotation for this block is over
// Now we can perform the sample extraction: for the body it's just
// the resulting constant coefficient of the accumulator
// For the mask it's more complicated
sample_extract_mask<Torus, params>(block_lwe_out, accumulator_c0, glwe_dim);
sample_extract_body<Torus, params>(block_lwe_out, accumulator_c0, glwe_dim);
}
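/*
 * Note on the rotation amounts (comment added for clarity): at the iteration
 * where the GGSW index is i (i goes from mbr_size - 1 down to 0),
 * monomial_degree equals mbr_size - 1 - i, so the candidate accumulator is
 * ACC * X^(-2^(mbr_size - 1 - i)) and the i-th GGSW selects between the
 * rotated and the non-rotated accumulator. After the mbr_size CMUXes, the
 * constant coefficient of the accumulator holds the LUT coefficient selected
 * by the bits encrypted in ggsw_in, which is what the final sample extraction
 * writes into block_lwe_out.
 */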
template <typename Torus>
__host__ __device__ uint64_t
get_memory_needed_per_block_blind_rotation_sample_extraction(
uint32_t glwe_dimension, uint32_t polynomial_size) {
return sizeof(Torus) * polynomial_size *
(glwe_dimension + 1) + // accumulator_c0
sizeof(Torus) * polynomial_size *
(glwe_dimension + 1) + // accumulator_c1
+get_memory_needed_per_block_cmux_tree<Torus>(glwe_dimension,
polynomial_size);
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_blind_rotation_sample_extraction(
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory) {
uint64_t memory_needed_per_block =
get_memory_needed_per_block_blind_rotation_sample_extraction<Torus>(
glwe_dimension, polynomial_size);
uint64_t device_mem = 0;
if (max_shared_memory < memory_needed_per_block) {
device_mem = memory_needed_per_block * tau;
}
if (max_shared_memory < polynomial_size * sizeof(double)) {
device_mem += polynomial_size * sizeof(double);
}
int ggsw_size = polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
uint64_t buffer_size = mbr_size * ggsw_size * sizeof(double) // d_ggsw_fft_in
+ device_mem;
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_blind_rotation_sample_extraction(
void *v_stream, uint32_t gpu_index, int8_t **br_se_buffer,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
uint32_t mbr_size, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t memory_needed_per_block =
get_memory_needed_per_block_blind_rotation_sample_extraction<Torus>(
glwe_dimension, polynomial_size);
if (max_shared_memory >= memory_needed_per_block) {
check_cuda_error(cudaFuncSetAttribute(
device_blind_rotation_and_sample_extraction<Torus, STorus, params,
FULLSM>,
cudaFuncAttributeMaxDynamicSharedMemorySize, memory_needed_per_block));
check_cuda_error(cudaFuncSetCacheConfig(
device_blind_rotation_and_sample_extraction<Torus, STorus, params,
FULLSM>,
cudaFuncCachePreferShared));
}
if (allocate_gpu_memory) {
uint64_t buffer_size =
get_buffer_size_blind_rotation_sample_extraction<Torus>(
glwe_dimension, polynomial_size, level_count, mbr_size, tau,
max_shared_memory);
*br_se_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
check_cuda_error(cudaGetLastError());
}
}
template <typename Torus, typename STorus, class params>
__host__ void host_blind_rotate_and_sample_extraction(
void *v_stream, uint32_t gpu_index, Torus *lwe_out, Torus *ggsw_in,
Torus *lut_vector, int8_t *br_se_buffer, uint32_t mbr_size, uint32_t tau,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, uint32_t max_shared_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint64_t memory_needed_per_block =
get_memory_needed_per_block_blind_rotation_sample_extraction<Torus>(
glwe_dimension, polynomial_size);
// Prepare the buffers
// Here all the buffers have double2 alignment
int ggsw_size = polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
double2 *d_ggsw_fft_in = (double2 *)br_se_buffer;
int8_t *d_mem_fft = (int8_t *)d_ggsw_fft_in +
(ptrdiff_t)(mbr_size * ggsw_size * sizeof(double));
int8_t *d_mem = d_mem_fft;
if (max_shared_memory < polynomial_size * sizeof(double)) {
d_mem = d_mem_fft + (ptrdiff_t)(polynomial_size * sizeof(double));
}
// Apply the FFT on m^br
batch_fft_ggsw_vector<Torus, STorus, params>(
stream, d_ggsw_fft_in, ggsw_in, d_mem_fft, mbr_size, glwe_dimension,
polynomial_size, level_count, gpu_index, max_shared_memory);
check_cuda_error(cudaGetLastError());
dim3 thds(polynomial_size / params::opt, 1, 1);
dim3 grid(tau, 1, 1);
if (max_shared_memory < memory_needed_per_block)
device_blind_rotation_and_sample_extraction<Torus, STorus, params, NOSM>
<<<grid, thds, 0, *stream>>>(lwe_out, lut_vector, d_ggsw_fft_in,
mbr_size,
glwe_dimension, // k
polynomial_size, base_log, level_count,
memory_needed_per_block, d_mem);
else
device_blind_rotation_and_sample_extraction<Torus, STorus, params, FULLSM>
<<<grid, thds, memory_needed_per_block, *stream>>>(
lwe_out, lut_vector, d_ggsw_fft_in, mbr_size,
glwe_dimension, // k
polynomial_size, base_log, level_count, memory_needed_per_block,
d_mem);
check_cuda_error(cudaGetLastError());
}
#endif // VERTICAL_PACKING_CUH

View File

@@ -1,559 +0,0 @@
#include "wop_bootstrap.cuh"
#include <cmath>
/*
* Runs standard checks to validate the inputs
*/
void checks_wop_pbs(int glwe_dimension, int polynomial_size,
int level_count_bsk, int crt_decomposition_size,
uint32_t *number_of_bits_to_extract_array) {
int total_bits_to_extract = 0;
for (int i = 0; i < crt_decomposition_size; i++) {
total_bits_to_extract += number_of_bits_to_extract_array[i];
}
assert(("Error (GPU WOP PBS): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
// The number of inputs should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU WOP PBS): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 4 * (k "
"+ 1) * "
"level_count_bsk",
crt_decomposition_size <=
number_of_sm / 4. / (glwe_dimension + 1) / level_count_bsk));
assert(("Error (GPU WOP PBS): the number of inputs x the number of extracted "
"bits should be "
"larger than log2 of the polynomial size",
total_bits_to_extract >= log2(polynomial_size)));
}
void checks_fast_circuit_bootstrap_vertical_packing(int polynomial_size) {
assert(("Error (GPU circuit bootstrap): polynomial_size should be one of "
"256, 512, 1024, 2048, 4096, 8192",
polynomial_size == 256 || polynomial_size == 512 ||
polynomial_size == 1024 || polynomial_size == 2048 ||
polynomial_size == 4096 || polynomial_size == 8192));
}
void checks_circuit_bootstrap_vertical_packing(int glwe_dimension,
int polynomial_size,
int number_of_inputs,
int level_count_bsk) {
// The number of inputs should be lower than the number of streaming
// multiprocessors divided by (4 * (k + 1) * l) (the factor 4 being related
// to the occupancy of 50%). The only supported value for k is 1, so
// k + 1 = 2 for now.
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
assert(("Error (GPU extract bits): the number of input LWEs must be lower or "
"equal to the "
"number of streaming multiprocessors on the device divided by 4 * (k "
"+ 1) "
"level_count_bsk",
number_of_inputs <=
number_of_sm / 4. / (glwe_dimension + 1) / level_count_bsk));
checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the circuit bootstrap and vertical packing on 32 bits inputs, into
* `cbs_vp_buffer`. It also fills the value of delta_log to be used in the
* circuit bootstrap.
*/
void scratch_cuda_circuit_bootstrap_vertical_packing_32(
void *v_stream, uint32_t gpu_index, int8_t **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_bsk, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_circuit_bootstrap_vertical_packing<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the circuit bootstrap and vertical packing on 64 bits inputs, into
* `cbs_vp_buffer`. It also fills the value of delta_log to be used in the
* circuit bootstrap.
*/
void scratch_cuda_circuit_bootstrap_vertical_packing_64(
void *v_stream, uint32_t gpu_index, int8_t **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_bsk, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_fast_circuit_bootstrap_vertical_packing(polynomial_size);
switch (polynomial_size) {
case 256:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_bsk, level_count_cbs,
number_of_inputs, tau, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the wop PBS on 32 bits inputs, into `wop_pbs_buffer`. It also fills the value
* of delta_log and cbs_delta_log to be used in the bit extract and circuit
* bootstrap.
*/
void scratch_cuda_wop_pbs_32(void *v_stream, uint32_t gpu_index,
int8_t **wop_pbs_buffer, uint32_t *delta_log_array,
uint32_t *cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t level_count_bsk,
uint32_t *number_of_bits_to_extract_array,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_wop_pbs(glwe_dimension, polynomial_size, level_count_bsk,
crt_decomposition_size, number_of_bits_to_extract_array);
switch (polynomial_size) {
case 256:
scratch_wop_pbs<uint32_t, int32_t, Degree<256>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_wop_pbs<uint32_t, int32_t, Degree<512>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_wop_pbs<uint32_t, int32_t, Degree<1024>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_wop_pbs<uint32_t, int32_t, Degree<2048>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_wop_pbs<uint32_t, int32_t, Degree<4096>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_wop_pbs<uint32_t, int32_t, Degree<8192>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
* This scratch function allocates the necessary amount of data on the GPU for
* the wop PBS on 64 bits inputs, into `wop_pbs_buffer`. It also fills the value
* of delta_log and cbs_delta_log to be used in the bit extract and circuit
* bootstrap.
*/
void scratch_cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index,
int8_t **wop_pbs_buffer, uint32_t *delta_log_array,
uint32_t *cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t level_count_cbs, uint32_t level_count_bsk,
uint32_t *number_of_bits_to_extract_array,
uint32_t crt_decomposition_size,
uint32_t max_shared_memory,
bool allocate_gpu_memory) {
checks_wop_pbs(glwe_dimension, polynomial_size, level_count_bsk,
crt_decomposition_size, number_of_bits_to_extract_array);
switch (polynomial_size) {
case 256:
scratch_wop_pbs<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 512:
scratch_wop_pbs<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 1024:
scratch_wop_pbs<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 2048:
scratch_wop_pbs<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 4096:
scratch_wop_pbs<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
case 8192:
scratch_wop_pbs<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, wop_pbs_buffer, delta_log_array, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, level_count_cbs,
level_count_bsk, number_of_bits_to_extract_array,
crt_decomposition_size, max_shared_memory, allocate_gpu_memory);
break;
default:
break;
}
}
/*
 * Entry point for the CUDA circuit bootstrap + vertical packing on batches of
 * 64-bit input LWE ciphertexts.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - 'lwe_array_out' list of output lwe ciphertexts
* - 'lwe_array_in' list of input lwe_ciphertexts
* - 'fourier_bsk' bootstrapping key in fourier domain, expected half size
* compressed complex key.
* - 'cbs_fpksk' list of private functional packing keyswitch keys
* - 'lut_vector' list of test vectors
* - 'cbs_vp_buffer' a pre-allocated array to store intermediate results
* - 'polynomial_size' size of the test polynomial, supported sizes:
* {256, 512, 1024, 2048, 4096, 8192}
* - 'glwe_dimension' supported dimensions: {1}
* - 'lwe_dimension' dimension of input LWE ciphertexts
* - 'level_count_bsk' decomposition level for bootstrapping
* - 'base_log_bsk' base log parameter for bootstrapping
* - 'level_count_pksk' decomposition level for fp-keyswitch
* - 'base_log_pksk' base log parameter for fp-keyswitch
* - 'level_count_cbs' level of circuit bootstrap
* - 'base_log_cbs' base log parameter for circuit bootstrap
* - 'number_of_inputs' number of input LWE ciphertexts
* - 'lut_number' number of LUTs given as input
* - 'max_shared_memory' maximum shared memory amount to be used in
* bootstrapping kernel
*
*/
void cuda_circuit_bootstrap_vertical_packing_64(
void *v_stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_array_in,
void *fourier_bsk, void *cbs_fpksk, void *lut_vector, int8_t *cbs_vp_buffer,
uint32_t cbs_delta_log, uint32_t polynomial_size, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t level_count_bsk, uint32_t base_log_bsk,
uint32_t level_count_pksk, uint32_t base_log_pksk, uint32_t level_count_cbs,
uint32_t base_log_cbs, uint32_t number_of_inputs, uint32_t lut_number,
uint32_t max_shared_memory) {
checks_circuit_bootstrap_vertical_packing(glwe_dimension, polynomial_size,
number_of_inputs, level_count_bsk);
switch (polynomial_size) {
case 256:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
max_shared_memory);
break;
case 512:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
max_shared_memory);
break;
case 1024:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
max_shared_memory);
break;
case 2048:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
max_shared_memory);
break;
case 4096:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
max_shared_memory);
break;
case 8192:
host_circuit_bootstrap_vertical_packing<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)cbs_fpksk, cbs_vp_buffer,
cbs_delta_log, glwe_dimension, lwe_dimension, polynomial_size,
base_log_bsk, level_count_bsk, base_log_pksk, level_count_pksk,
base_log_cbs, level_count_cbs, number_of_inputs, lut_number,
max_shared_memory);
break;
default:
break;
}
}
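/*
 * Illustrative usage sketch (added as an example; not part of the original
 * API). It shows how the scratch and run entry points above are meant to be
 * chained: `tau` is passed as `lut_number`, all other parameters and the
 * device pointers are assumed to be prepared by the caller, and the buffer
 * must eventually be released with
 * cleanup_cuda_circuit_bootstrap_vertical_packing (defined further below).
 */
static void example_cuda_circuit_bootstrap_vertical_packing_64(
    void *v_stream, uint32_t gpu_index, void *d_lwe_array_out,
    void *d_lwe_array_in, void *d_fourier_bsk, void *d_cbs_fpksk,
    void *d_lut_vector, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log_bsk, uint32_t level_count_bsk,
    uint32_t base_log_pksk, uint32_t level_count_pksk, uint32_t base_log_cbs,
    uint32_t level_count_cbs, uint32_t number_of_inputs, uint32_t tau,
    uint32_t max_shared_memory) {
  int8_t *cbs_vp_buffer = nullptr;
  uint32_t cbs_delta_log = 0;
  // Allocate the intermediate buffer and get the delta_log to use in the CBS
  scratch_cuda_circuit_bootstrap_vertical_packing_64(
      v_stream, gpu_index, &cbs_vp_buffer, &cbs_delta_log, glwe_dimension,
      lwe_dimension, polynomial_size, level_count_bsk, level_count_cbs,
      number_of_inputs, tau, max_shared_memory, true);
  // Circuit bootstrap the whole batch, then apply the vertical packing
  cuda_circuit_bootstrap_vertical_packing_64(
      v_stream, gpu_index, d_lwe_array_out, d_lwe_array_in, d_fourier_bsk,
      d_cbs_fpksk, d_lut_vector, cbs_vp_buffer, cbs_delta_log, polynomial_size,
      glwe_dimension, lwe_dimension, level_count_bsk, base_log_bsk,
      level_count_pksk, base_log_pksk, level_count_cbs, base_log_cbs,
      number_of_inputs, tau, max_shared_memory);
}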
/*
 * Entry point for the full programmable bootstrap without padding (WoP-PBS) on
 * 64-bit input LWE ciphertexts.
* - `v_stream` is a void pointer to the Cuda stream to be used in the kernel
* launch
* - `gpu_index` is the index of the GPU to be used in the kernel launch
* - 'lwe_array_out' list of output lwe ciphertexts
* - 'lwe_array_in' list of input lwe_ciphertexts
* - 'lut_vector' list of test vectors
* - 'fourier_bsk' bootstrapping key in fourier domain, expected half size
* compressed complex key.
* - 'ksk' keyswitch key to use inside extract bits block
* - 'cbs_fpksk' list of fp-keyswitch keys
* - 'wop_pbs_buffer' a pre-allocated array to store intermediate results
* - 'glwe_dimension' supported dimensions: {1}
* - 'lwe_dimension' dimension of input lwe ciphertexts
* - 'polynomial_size' size of the test polynomial, supported sizes:
* {256, 512, 1024, 2048, 4096, 8192}
* - 'base_log_bsk' base log parameter for bootstrapping
* - 'level_count_bsk' decomposition level for bootstrapping
* - 'base_log_ksk' base log parameter for keyswitch
* - 'level_count_ksk' decomposition level for keyswitch
* - 'base_log_pksk' base log parameter for fp-keyswitch
* - 'level_count_pksk' decomposition level for fp-keyswitch
* - 'base_log_cbs' base log parameter for circuit bootstrap
* - 'level_count_cbs' level of circuit bootstrap
* - 'number_of_bits_of_message_including_padding' number of bits to extract
* from each input lwe ciphertext including padding bit
* - 'number_of_bits_to_extract' number of bits to extract
* from each input lwe ciphertext without padding bit
* - 'crt_decomposition_size' number of input lwe ciphertexts
* - 'max_shared_memory' maximum shared memory amount to be used in
* bootstrapping kernel
*
*/
void cuda_wop_pbs_64(void *v_stream, uint32_t gpu_index, void *lwe_array_out,
void *lwe_array_in, void *lut_vector, void *fourier_bsk,
void *ksk, void *cbs_fpksk, int8_t *wop_pbs_buffer,
uint32_t cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log_bsk, uint32_t level_count_bsk,
uint32_t base_log_ksk, uint32_t level_count_ksk,
uint32_t base_log_pksk, uint32_t level_count_pksk,
uint32_t base_log_cbs, uint32_t level_count_cbs,
uint32_t *number_of_bits_to_extract_array,
uint32_t *delta_log_array, uint32_t crt_decomposition_size,
uint32_t max_shared_memory) {
checks_wop_pbs(glwe_dimension, polynomial_size, level_count_bsk,
crt_decomposition_size, number_of_bits_to_extract_array);
switch (polynomial_size) {
case 256:
host_wop_pbs<uint64_t, int64_t, Degree<256>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, number_of_bits_to_extract_array, delta_log_array,
crt_decomposition_size, max_shared_memory);
break;
case 512:
host_wop_pbs<uint64_t, int64_t, Degree<512>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, number_of_bits_to_extract_array, delta_log_array,
crt_decomposition_size, max_shared_memory);
break;
case 1024:
host_wop_pbs<uint64_t, int64_t, Degree<1024>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, number_of_bits_to_extract_array, delta_log_array,
crt_decomposition_size, max_shared_memory);
break;
case 2048:
host_wop_pbs<uint64_t, int64_t, Degree<2048>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, number_of_bits_to_extract_array, delta_log_array,
crt_decomposition_size, max_shared_memory);
break;
case 4096:
host_wop_pbs<uint64_t, int64_t, Degree<4096>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, number_of_bits_to_extract_array, delta_log_array,
crt_decomposition_size, max_shared_memory);
break;
case 8192:
host_wop_pbs<uint64_t, int64_t, Degree<8192>>(
v_stream, gpu_index, (uint64_t *)lwe_array_out,
(uint64_t *)lwe_array_in, (uint64_t *)lut_vector,
(double2 *)fourier_bsk, (uint64_t *)ksk, (uint64_t *)cbs_fpksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
level_count_ksk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, number_of_bits_to_extract_array, delta_log_array,
crt_decomposition_size, max_shared_memory);
break;
default:
break;
}
}
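/*
 * Illustrative usage sketch (added as an example; not part of the original
 * API). It chains the scratch and run entry points of the WoP-PBS: the key
 * material, the bits-to-extract array and the cryptographic parameters are
 * assumed to be prepared by the caller, delta_log_array and cbs_delta_log are
 * filled by the scratch call, and the buffer must eventually be released with
 * cleanup_cuda_wop_pbs (defined below).
 */
static void example_cuda_wop_pbs_64(
    void *v_stream, uint32_t gpu_index, void *d_lwe_array_out,
    void *d_lwe_array_in, void *d_lut_vector, void *d_fourier_bsk, void *d_ksk,
    void *d_cbs_fpksk, uint32_t glwe_dimension, uint32_t lwe_dimension,
    uint32_t polynomial_size, uint32_t base_log_bsk, uint32_t level_count_bsk,
    uint32_t base_log_ksk, uint32_t level_count_ksk, uint32_t base_log_pksk,
    uint32_t level_count_pksk, uint32_t base_log_cbs, uint32_t level_count_cbs,
    uint32_t *number_of_bits_to_extract_array, uint32_t *delta_log_array,
    uint32_t crt_decomposition_size, uint32_t max_shared_memory) {
  int8_t *wop_pbs_buffer = nullptr;
  uint32_t cbs_delta_log = 0;
  // Allocate the intermediate buffer and compute the delta_log values
  scratch_cuda_wop_pbs_64(v_stream, gpu_index, &wop_pbs_buffer,
                          delta_log_array, &cbs_delta_log, glwe_dimension,
                          lwe_dimension, polynomial_size, level_count_cbs,
                          level_count_bsk, number_of_bits_to_extract_array,
                          crt_decomposition_size, max_shared_memory, true);
  // Bit extraction + circuit bootstrap + vertical packing on the whole batch
  cuda_wop_pbs_64(v_stream, gpu_index, d_lwe_array_out, d_lwe_array_in,
                  d_lut_vector, d_fourier_bsk, d_ksk, d_cbs_fpksk,
                  wop_pbs_buffer, cbs_delta_log, glwe_dimension, lwe_dimension,
                  polynomial_size, base_log_bsk, level_count_bsk, base_log_ksk,
                  level_count_ksk, base_log_pksk, level_count_pksk,
                  base_log_cbs, level_count_cbs,
                  number_of_bits_to_extract_array, delta_log_array,
                  crt_decomposition_size, max_shared_memory);
}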
/*
* This cleanup function frees the data for the wop PBS on GPU in wop_pbs_buffer
* for 32 or 64 bits inputs.
*/
void cleanup_cuda_wop_pbs(void *v_stream, uint32_t gpu_index,
int8_t **wop_pbs_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*wop_pbs_buffer, stream, gpu_index);
}
/*
* This cleanup function frees the data for the circuit bootstrap and vertical
* packing on GPU in cbs_vp_buffer for 32 or 64 bits inputs.
*/
void cleanup_cuda_circuit_bootstrap_vertical_packing(void *v_stream,
uint32_t gpu_index,
int8_t **cbs_vp_buffer) {
auto stream = static_cast<cudaStream_t *>(v_stream);
// Free memory
cuda_drop_async(*cbs_vp_buffer, stream, gpu_index);
}

View File

@@ -1,322 +0,0 @@
#ifndef WOP_PBS_H
#define WOP_PBS_H
#include "bit_extraction.cuh"
#include "bootstrap.h"
#include "circuit_bootstrap.cuh"
#include "device.h"
#include "utils/kernel_dimensions.cuh"
#include "utils/timer.cuh"
#include "vertical_packing.cuh"
template <typename Torus, class params>
__global__ void device_build_lut(Torus *lut_out, Torus *lut_in,
uint32_t glwe_dimension, uint32_t lut_number) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < glwe_dimension * params::degree * lut_number) {
int lut_index = index / (glwe_dimension * params::degree);
for (int j = 0; j < glwe_dimension; j++) {
lut_out[index + lut_index * (glwe_dimension + 1) * params::degree +
j * params::degree] = 0;
}
lut_out[index + lut_index * (glwe_dimension + 1) * params::degree +
glwe_dimension * params::degree] = lut_in[index];
}
}
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_cbs_vp(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t level_count_cbs,
uint32_t tau,
uint32_t number_of_inputs) {
int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size;
uint64_t buffer_size =
number_of_inputs * level_count_cbs * sizeof(Torus) + // lut_vector_indexes
number_of_inputs * ggsw_size * sizeof(Torus) + // ggsw_out_cbs
tau * (glwe_dimension + 1) * polynomial_size *
sizeof(Torus); // glwe_array_out_cmux_tree
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_circuit_bootstrap_vertical_packing(
void *v_stream, uint32_t gpu_index, int8_t **cbs_vp_buffer,
uint32_t *cbs_delta_log, uint32_t glwe_dimension, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t level_bsk, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
// Allocate lut vector indexes on the CPU first to avoid blocking the stream
Torus *h_lut_vector_indexes =
(Torus *)malloc(number_of_inputs * level_count_cbs * sizeof(Torus));
uint32_t mbr_size = std::min(params::log2_degree, (int)number_of_inputs);
uint64_t buffer_size =
get_buffer_size_cbs_vp<Torus>(glwe_dimension, polynomial_size,
level_count_cbs, tau, number_of_inputs) +
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension, polynomial_size,
level_count_cbs, number_of_inputs) +
get_buffer_size_bootstrap_low_latency<Torus>(
glwe_dimension, polynomial_size, level_bsk,
number_of_inputs * level_count_cbs, max_shared_memory) +
get_buffer_size_cmux_tree<Torus, params>(
glwe_dimension, polynomial_size, level_count_cbs,
1 << number_of_inputs, tau, max_shared_memory) +
get_buffer_size_blind_rotation_sample_extraction<Torus>(
glwe_dimension, polynomial_size, level_count_cbs, mbr_size, tau,
max_shared_memory);
// allocate device pointer for circuit bootstrap and vertical
// packing
if (allocate_gpu_memory) {
*cbs_vp_buffer =
(int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
}
// indexes of lut vectors for cbs
for (uint index = 0; index < level_count_cbs * number_of_inputs; index++) {
h_lut_vector_indexes[index] = index % level_count_cbs;
}
// lut_vector_indexes is the last buffer in the cbs_vp_buffer
uint64_t lut_vector_indexes_size =
number_of_inputs * level_count_cbs * sizeof(Torus);
int8_t *d_lut_vector_indexes =
(int8_t *)*cbs_vp_buffer +
(ptrdiff_t)(buffer_size - lut_vector_indexes_size);
cuda_memcpy_async_to_gpu((Torus *)d_lut_vector_indexes, h_lut_vector_indexes,
lut_vector_indexes_size, stream, gpu_index);
check_cuda_error(cudaStreamSynchronize(*stream));
free(h_lut_vector_indexes);
check_cuda_error(cudaGetLastError());
uint32_t bits = sizeof(Torus) * 8;
*cbs_delta_log = (bits - 1);
scratch_circuit_bootstrap<Torus, STorus, params>(
v_stream, gpu_index, cbs_vp_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_bsk, level_count_cbs, number_of_inputs,
max_shared_memory, false);
scratch_cmux_tree<Torus, STorus, params>(
v_stream, gpu_index, cbs_vp_buffer, glwe_dimension, polynomial_size,
level_count_cbs, number_of_inputs, tau, max_shared_memory, false);
scratch_blind_rotation_sample_extraction<Torus, STorus, params>(
v_stream, gpu_index, cbs_vp_buffer, glwe_dimension, polynomial_size,
level_count_cbs, mbr_size, tau, max_shared_memory, false);
}
// number_of_inputs is the total number of LWE ciphertexts passed to CBS + VP,
// i.e. tau * p where tau is the number of LUTs (the original number of LWEs
// before bit extraction) and p is the number of extracted bits
template <typename Torus, typename STorus, class params>
__host__ void host_circuit_bootstrap_vertical_packing(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_array_in, Torus *lut_vector, double2 *fourier_bsk,
Torus *cbs_fpksk, int8_t *cbs_vp_buffer, uint32_t cbs_delta_log,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log_bsk, uint32_t level_count_bsk, uint32_t base_log_pksk,
uint32_t level_count_pksk, uint32_t base_log_cbs, uint32_t level_count_cbs,
uint32_t number_of_inputs, uint32_t tau, uint32_t max_shared_memory) {
// Define the buffers
  // Always define the buffers with the strongest memory alignment requirement
  // first. Here the only requirement is that lut_vector_indexes should be
  // defined last, since all the other buffers are aligned on double2 (every
  // buffer whose size is a multiple of polynomial_size * sizeof(Torus) is
  // aligned on double2)
int ggsw_size = level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size;
int8_t *cbs_buffer = (int8_t *)cbs_vp_buffer;
int8_t *ggsw_out_cbs =
cbs_buffer +
(ptrdiff_t)(get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
number_of_inputs) +
get_buffer_size_bootstrap_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count_bsk,
number_of_inputs * level_count_cbs, max_shared_memory));
// number_of_inputs = tau * p is the total number of GGSWs
// split the vec of GGSW in two, the msb GGSW is for the CMux tree and the
// lsb GGSW is for the last blind rotation.
uint32_t mbr_size = std::min(params::log2_degree, (int)number_of_inputs);
int8_t *cmux_tree_buffer =
ggsw_out_cbs + (ptrdiff_t)(number_of_inputs * ggsw_size * sizeof(Torus));
int8_t *glwe_array_out_cmux_tree =
cmux_tree_buffer + (ptrdiff_t)(get_buffer_size_cmux_tree<Torus, params>(
glwe_dimension, polynomial_size, level_count_cbs,
1 << number_of_inputs, tau, max_shared_memory));
int8_t *br_se_buffer =
glwe_array_out_cmux_tree +
(ptrdiff_t)(tau * (glwe_dimension + 1) * polynomial_size * sizeof(Torus));
Torus *lut_vector_indexes =
(Torus *)br_se_buffer +
(ptrdiff_t)(get_buffer_size_blind_rotation_sample_extraction<Torus>(
glwe_dimension, polynomial_size, level_count_cbs,
mbr_size, tau, max_shared_memory) /
sizeof(Torus));
// Circuit bootstrap
host_circuit_bootstrap<Torus, params>(
v_stream, gpu_index, (Torus *)ggsw_out_cbs, lwe_array_in, fourier_bsk,
cbs_fpksk, lut_vector_indexes, cbs_buffer, cbs_delta_log, polynomial_size,
glwe_dimension, lwe_dimension, level_count_bsk, base_log_bsk,
level_count_pksk, base_log_pksk, level_count_cbs, base_log_cbs,
number_of_inputs, max_shared_memory);
check_cuda_error(cudaGetLastError());
// CMUX Tree
uint64_t lut_vector_size = (1 << number_of_inputs);
host_cmux_tree<Torus, STorus, params>(
v_stream, gpu_index, (Torus *)glwe_array_out_cmux_tree,
(Torus *)ggsw_out_cbs, lut_vector, cmux_tree_buffer, glwe_dimension,
polynomial_size, base_log_cbs, level_count_cbs, lut_vector_size, tau,
max_shared_memory);
check_cuda_error(cudaGetLastError());
// Blind rotation + sample extraction
// mbr = tau * p - r = log2(N)
// br_ggsw is a pointer to a sub-part of the ggsw_out_cbs buffer, for the
// blind rotation
uint32_t cmux_ggsw_len =
max(0, (int)number_of_inputs - (int)params::log2_degree);
Torus *br_ggsw =
(Torus *)ggsw_out_cbs +
(ptrdiff_t)(cmux_ggsw_len * level_count_cbs * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size);
host_blind_rotate_and_sample_extraction<Torus, STorus, params>(
v_stream, gpu_index, lwe_array_out, br_ggsw,
(Torus *)glwe_array_out_cmux_tree, br_se_buffer, mbr_size, tau,
glwe_dimension, polynomial_size, base_log_cbs, level_count_cbs,
max_shared_memory);
}
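/*
 * Worked example (illustrative numbers, added for clarity): for tau = 2 LUTs
 * with p = 6 extracted bits each and N = 1024 (params::log2_degree = 10),
 * number_of_inputs = 12 GGSWs come out of the circuit bootstrap. The
 * cmux_ggsw_len = max(0, 12 - 10) = 2 most significant GGSWs drive the CMUX
 * tree (r = 2 layers over 2^r = 4 GLWE leaves per LUT), and the remaining
 * mbr_size = min(10, 12) = 10 GGSWs (br_ggsw) drive the final blind rotation
 * and sample extraction that produce the tau output LWEs.
 */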
template <typename Torus>
__host__ __device__ uint64_t get_buffer_size_wop_pbs(
uint32_t lwe_dimension, uint32_t total_bits_of_crt_decomposition) {
uint64_t buffer_size = (lwe_dimension + 1) *
(total_bits_of_crt_decomposition) *
sizeof(Torus); // lwe_array_out_bit_extract
return buffer_size + buffer_size % sizeof(double2);
}
template <typename Torus, typename STorus, typename params>
__host__ void scratch_wop_pbs(
void *v_stream, uint32_t gpu_index, int8_t **wop_pbs_buffer,
uint32_t *delta_log_array, uint32_t *cbs_delta_log, uint32_t glwe_dimension,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count_cbs,
uint32_t level_count_bsk, uint32_t *number_of_bits_to_extract_array,
uint32_t crt_decomposition_size, uint32_t max_shared_memory,
bool allocate_gpu_memory) {
cudaSetDevice(gpu_index);
auto stream = static_cast<cudaStream_t *>(v_stream);
uint32_t ciphertext_total_bits_count = sizeof(Torus) * 8;
int total_bits_to_extract = 0;
for (int i = 0; i < crt_decomposition_size; i++) {
total_bits_to_extract += number_of_bits_to_extract_array[i];
delta_log_array[i] =
ciphertext_total_bits_count - number_of_bits_to_extract_array[i];
}
uint64_t bit_extract_buffer_size =
get_buffer_size_extract_bits<Torus>(glwe_dimension, lwe_dimension,
polynomial_size,
crt_decomposition_size) +
get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count_bsk,
crt_decomposition_size, max_shared_memory);
uint32_t cbs_vp_number_of_inputs = total_bits_to_extract;
uint32_t mbr_size =
std::min(params::log2_degree, (int)(total_bits_to_extract));
if (allocate_gpu_memory) {
uint64_t buffer_size =
bit_extract_buffer_size +
get_buffer_size_wop_pbs<Torus>(lwe_dimension, total_bits_to_extract) +
get_buffer_size_cbs_vp<Torus>(glwe_dimension, polynomial_size,
level_count_cbs, crt_decomposition_size,
cbs_vp_number_of_inputs) +
get_buffer_size_cbs<Torus>(glwe_dimension, lwe_dimension,
polynomial_size, level_count_cbs,
cbs_vp_number_of_inputs) +
get_buffer_size_bootstrap_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count_bsk,
cbs_vp_number_of_inputs * level_count_cbs, max_shared_memory) +
get_buffer_size_cmux_tree<Torus, params>(
glwe_dimension, polynomial_size, level_count_cbs,
(1 << cbs_vp_number_of_inputs), crt_decomposition_size,
max_shared_memory) +
get_buffer_size_blind_rotation_sample_extraction<Torus>(
glwe_dimension, polynomial_size, level_count_cbs, mbr_size,
crt_decomposition_size, max_shared_memory);
*wop_pbs_buffer =
(int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
}
int8_t *bit_extract_buffer =
(int8_t *)*wop_pbs_buffer + (ptrdiff_t)(get_buffer_size_wop_pbs<Torus>(
lwe_dimension, total_bits_to_extract));
scratch_extract_bits<Torus, STorus, params>(
v_stream, gpu_index, &bit_extract_buffer, glwe_dimension, lwe_dimension,
polynomial_size, level_count_bsk, crt_decomposition_size,
max_shared_memory, false);
int8_t *cbs_vp_buffer =
bit_extract_buffer + (ptrdiff_t)bit_extract_buffer_size;
scratch_circuit_bootstrap_vertical_packing<Torus, STorus, params>(
v_stream, gpu_index, &cbs_vp_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, level_count_bsk, level_count_cbs,
total_bits_to_extract, crt_decomposition_size, max_shared_memory, false);
}
template <typename Torus, typename STorus, class params>
__host__ void host_wop_pbs(
void *v_stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_array_in, Torus *lut_vector, double2 *fourier_bsk, Torus *ksk,
Torus *cbs_fpksk, int8_t *wop_pbs_buffer, uint32_t cbs_delta_log,
uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log_bsk, uint32_t level_count_bsk, uint32_t base_log_ksk,
uint32_t level_count_ksk, uint32_t base_log_pksk, uint32_t level_count_pksk,
uint32_t base_log_cbs, uint32_t level_count_cbs,
uint32_t *number_of_bits_to_extract_array, uint32_t *delta_log_array,
uint32_t crt_decomposition_size, uint32_t max_shared_memory) {
int total_bits_to_extract = 0;
for (int i = 0; i < crt_decomposition_size; i++) {
total_bits_to_extract += number_of_bits_to_extract_array[i];
}
int8_t *bit_extract_buffer = wop_pbs_buffer;
int8_t *lwe_array_out_bit_extract =
bit_extract_buffer +
(ptrdiff_t)(get_buffer_size_extract_bits<Torus>(
glwe_dimension, lwe_dimension, polynomial_size,
crt_decomposition_size) +
get_buffer_size_bootstrap_fast_low_latency<Torus>(
glwe_dimension, polynomial_size, level_count_bsk,
crt_decomposition_size, max_shared_memory));
host_extract_bits<Torus, params>(
v_stream, gpu_index, (Torus *)lwe_array_out_bit_extract, lwe_array_in,
bit_extract_buffer, ksk, fourier_bsk, number_of_bits_to_extract_array,
delta_log_array, glwe_dimension * polynomial_size, lwe_dimension,
glwe_dimension, polynomial_size, base_log_bsk, level_count_bsk,
base_log_ksk, level_count_ksk, crt_decomposition_size, max_shared_memory);
check_cuda_error(cudaGetLastError());
int8_t *cbs_vp_buffer =
lwe_array_out_bit_extract + (ptrdiff_t)(get_buffer_size_wop_pbs<Torus>(
lwe_dimension, total_bits_to_extract));
host_circuit_bootstrap_vertical_packing<Torus, STorus, params>(
v_stream, gpu_index, lwe_array_out, (Torus *)lwe_array_out_bit_extract,
lut_vector, fourier_bsk, cbs_fpksk, cbs_vp_buffer, cbs_delta_log,
glwe_dimension, lwe_dimension, polynomial_size, base_log_bsk,
level_count_bsk, base_log_pksk, level_count_pksk, base_log_cbs,
level_count_cbs, total_bits_to_extract, crt_decomposition_size,
max_shared_memory);
check_cuda_error(cudaGetLastError());
}
#endif // WOP_PBS_H

View File

@@ -1,11 +0,0 @@
option(CONCRETE_CUDA_BUILD_TESTS "Build the test tool" ON)
option(CONCRETE_CUDA_BUILD_BENCHMARKS "Build the benchmark tool" ON)
if(CONCRETE_CUDA_BUILD_TESTS)
message(STATUS "Building with Concrete CUDA test tool")
add_subdirectory(test)
endif()
if(CONCRETE_CUDA_BUILD_BENCHMARKS)
message(STATUS "Building with Concrete CUDA benchmark tool")
add_subdirectory(benchmark)
endif()

View File

@@ -1,93 +0,0 @@
find_package(CUDA REQUIRED)
find_package(CUDAToolkit REQUIRED)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
# Disable the Google Benchmark requirement on Google Test
set(BENCHMARK_ENABLE_GTEST_TESTS OFF)
set(BENCHMARK_ENABLE_TESTING OFF)
include(FetchContent)
FetchContent_Declare(
googlebenchmark
GIT_REPOSITORY https://github.com/google/benchmark.git
GIT_TAG v1.7.1)
FetchContent_MakeAvailable(googlebenchmark)
# Enable ExternalProject CMake module
include(ExternalProject)
set(CONCRETE_CPU_BINARY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation/target/release")
set(CONCRETE_CPU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation")
set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")
# Enable ExternalProject CMake module
include(ExternalProject)
if(NOT TARGET concrete_cpu)
ExternalProject_Add(
concrete_cpu
SOURCE_DIR ${CONCRETE_CPU_SOURCE_DIR}
DOWNLOAD_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND cargo +nightly build --release --features=nightly
COMMAND cargo +nightly build --release --features=nightly
BINARY_DIR ${CONCRETE_CPU_BINARY_DIR}
BUILD_ALWAYS true
INSTALL_COMMAND ""
LOG_BUILD ON)
endif()
set(TFHE_RS_SOURCE_DIR "${CMAKE_BINARY_DIR}/tfhe-rs")
set(TFHE_RS_BINARY_DIR "${TFHE_RS_SOURCE_DIR}/target/release")
if(NOT TARGET tfhe-rs)
ExternalProject_Add(
tfhe-rs
GIT_REPOSITORY https://github.com/zama-ai/tfhe-rs.git
GIT_TAG main
SOURCE_DIR ${TFHE_RS_SOURCE_DIR}
BUILD_IN_SOURCE 1
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND make build_c_api
INSTALL_COMMAND ""
LOG_BUILD ON)
endif()
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
include_directories(${CONCRETE_CPU_SOURCE_DIR}/include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
include_directories(${TFHE_RS_BINARY_DIR})
include_directories("${CUDA_INCLUDE_DIRS}" "${CMAKE_CURRENT_SOURCE_DIR}")
find_package(OpenMP REQUIRED)
# Add the OpenMP flag to the compiler flags
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
add_library(concrete_cpu_lib STATIC IMPORTED)
add_dependencies(concrete_cpu_lib concrete_cpu)
set_target_properties(concrete_cpu_lib PROPERTIES IMPORTED_LOCATION ${CONCRETE_CPU_BINARY_DIR}/libconcrete_cpu.a)
add_library(tfhe_rs_lib STATIC IMPORTED)
add_dependencies(tfhe_rs_lib tfhe-rs)
set_target_properties(tfhe_rs_lib PROPERTIES IMPORTED_LOCATION ${TFHE_RS_BINARY_DIR}/libtfhe.a)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed,--allow-multiple-definition -ldl")
set(BINARY benchmark_concrete_cuda)
file(
GLOB_RECURSE BENCH_SOURCES
LIST_DIRECTORIES false
benchmark*.cpp main.cpp)
add_executable(${BINARY} ${BENCH_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)
set_target_properties(benchmark_concrete_cuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(
benchmark_concrete_cuda
PUBLIC benchmark::benchmark concrete_cpu_lib tfhe_rs_lib concrete_cuda OpenMP::OpenMP_CXX
PRIVATE CUDA::cudart)

View File

@@ -1,106 +0,0 @@
# benchmark_concrete_cuda
This benchmark tool is built on top of the Google Benchmark library. It measures the performance of the concrete framework's CUDA-accelerated functions and helps identify potential bottlenecks.
The output format can be adjusted according to the user's interest.
Each benchmark is executed once and targets a single function. Internally, for each benchmark, the tool repeats the targeted function as many times as needed to report an execution time with sufficient reliability. At this point, the variation we've observed in the benchmarked functions is relatively small, so we chose not to repeat benchmarks by default. However, this can also be tuned by the user if needed.
## How to Compile
The first step in compiling code with CMake is to create a build directory. This directory will
contain all the files generated during the build process, such as object files and executables.
We recommend creating this directory outside of the source directory, but inside the
implementation folder, to keep the source directory clean.
```bash
$ cd concrete/backends/concrete-cuda/implementation
$ mkdir build
$ cd build
```
Run CMake to generate the build files and then use make to compile the project.
```bash
$ cmake ..
$ make
```
The binary can be found in `concrete/backends/concrete-cuda/implementation/build/test_and_benchmark/benchmark`.
## How to Run Benchmarks
To run benchmarks, you can simply execute the `benchmark_concrete_cuda` executable with no arguments:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda
```
This will run all the available benchmarks.
## Output format
The reports are printed to the standard output if you don't pass any arguments. However, Google Benchmark has extended documentation on how to export them in other formats and to files, e.g., `--benchmark_format=json` will print everything in JSON.
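For instance, the first command below prints the whole report as JSON on the standard output, while the second one uses the standard `--benchmark_out` flags of Google Benchmark to write it to a `results.json` file (the binary path assumes the build layout described above):
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_format=json
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_out=results.json --benchmark_out_format=json
```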
## How to Filter Benchmarks
You can filter benchmarks by specifying a regular expression as an argument. Only benchmarks whose name matches the regular expression will be executed.
For example, to run only benchmarks whose name contains the word "Bootstrap", you can execute:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_filter=Bootstrap
```
The parameter `--benchmark_list_tests` can be used to list all the available benchmarks.
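For example, to list every available benchmark without running anything:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_list_tests=true
```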
## How to Set the Time Unit
By default, benchmarks are reported in seconds. However, you can change the time unit to one of the following:
* `ns` (nanoseconds)
* `us` (microseconds)
* `ms` (milliseconds)
* `s` (seconds)
To set the time unit, use the `--benchmark_time_unit` option followed by the desired time unit:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_time_unit=us
```
## How to Set the Number of Iterations
By default, each benchmark is executed for a number of iterations that is automatically determined by the Google Benchmark library.
However, you can increase the minimum time used for each measurement to increase the number of
iterations by using `--benchmark_min_time`. For instance:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_min_time=10
```
will force the tool to keep iterating each benchmark for at least 10 seconds.
## Statistics about the benchmarks
By default, each benchmark is executed only once. However, if you use
`--benchmark_repetitions` you can repeat each benchmark several times and compute the mean,
median, and standard deviation across the runs.
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_repetitions=10
```
Doing this, the execution time of each run will be reported. If you prefer, you can use
`--benchmark_report_aggregates_only=true` to report only the statistical data, or
`--benchmark_display_aggregates_only=true`, which displays only the statistical data on the
standard output while still reporting every run in the output file.
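For example, to repeat each benchmark 10 times and display only the aggregated statistics:
```bash
$ test_and_benchmark/benchmark/benchmark_concrete_cuda --benchmark_repetitions=10 --benchmark_report_aggregates_only=true
```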
## Known issues
When results are displayed on the standard output, the throughput unit is labeled as "number of operations per second". This is a quirk in the way Google Benchmark presents user-defined counters: the value is actually computed per dollar (using an AWS cost per second), so the correct unit is "operations per dollar".
## Conclusion
With these options, you can easily run and filter benchmarks, set the time unit, and control the number of iterations and repetitions of `benchmark_concrete_cuda`. If you have any questions or issues, please feel free to contact us.
To learn more about the Google Benchmark library, please refer to the [official user guide](https://github.com/google/benchmark/blob/main/docs/user_guide.md).

View File

@@ -1,141 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <setup_and_teardown.h>
#include <vector>
using namespace std;
const unsigned MAX_INPUTS = 4;
const unsigned SAMPLES = 1;
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
int number_of_inputs;
int number_of_bits_of_message_including_padding_0;
int number_of_bits_of_message_including_padding_1;
int number_of_bits_of_message_including_padding_2;
int number_of_bits_of_message_including_padding_3;
int number_of_bits_to_extract_0;
int number_of_bits_to_extract_1;
int number_of_bits_to_extract_2;
int number_of_bits_to_extract_3;
} BitExtractionBenchmarkParams;
class BitExtraction_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance = 7.52316384526264e-37;
double glwe_modular_variance = 7.52316384526264e-37;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
uint32_t number_of_bits_of_message_including_padding_array[MAX_INPUTS];
uint32_t number_of_bits_to_extract_array[MAX_INPUTS];
int number_of_inputs;
uint64_t delta_array[MAX_INPUTS];
uint32_t delta_log_array[MAX_INPUTS];
Csprng *csprng;
cudaStream_t *stream_array[SAMPLES];
int gpu_index = 0;
uint64_t *plaintexts;
double *d_fourier_bsk;
uint64_t *d_ksk;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
int8_t *bit_extract_buffer_array[SAMPLES];
uint64_t *lwe_sk_in;
uint64_t *lwe_sk_out;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
for (size_t i = 0; i < SAMPLES; i++) {
stream_array[i] = cuda_create_stream(0);
}
// TestParams
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
ks_base_log = state.range(5);
ks_level = state.range(6);
number_of_inputs = state.range(7);
for (int i = 0; i < number_of_inputs; i++) {
number_of_bits_of_message_including_padding_array[i] = state.range(8 + i);
number_of_bits_to_extract_array[i] = state.range(12 + i);
}
bit_extraction_setup(
stream_array, &csprng, &lwe_sk_in, &lwe_sk_out, &d_fourier_bsk, &d_ksk,
&plaintexts, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
bit_extract_buffer_array, lwe_dimension, glwe_dimension,
polynomial_size, lwe_modular_variance, glwe_modular_variance,
ks_base_log, ks_level, pbs_base_log, pbs_level,
number_of_bits_of_message_including_padding_array,
number_of_bits_to_extract_array, delta_log_array, delta_array,
number_of_inputs, 1, 1, gpu_index);
}
void TearDown(const ::benchmark::State &state) {
bit_extraction_teardown(stream_array, csprng, lwe_sk_in, lwe_sk_out,
d_fourier_bsk, d_ksk, plaintexts, d_lwe_ct_in_array,
d_lwe_ct_out_array, bit_extract_buffer_array,
SAMPLES, gpu_index);
}
};
BENCHMARK_DEFINE_F(BitExtraction_u64, ConcreteCuda_BitExtraction)
(benchmark::State &st) {
for (auto _ : st) {
// Execute bit extract
cuda_extract_bits_64(
stream_array[0], gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_ct_in_array, bit_extract_buffer_array[0], (void *)d_ksk,
(void *)d_fourier_bsk, number_of_bits_to_extract_array, delta_log_array,
glwe_dimension * polynomial_size, lwe_dimension, glwe_dimension,
polynomial_size, pbs_base_log, pbs_level, ks_base_log, ks_level,
number_of_inputs, cuda_get_max_shared_memory(gpu_index));
cuda_synchronize_stream((void *)stream_array[0]);
}
st.counters["Throughput"] =
benchmark::Counter(number_of_inputs / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
static void
BitExtractionBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
std::vector<BitExtractionBenchmarkParams> params = {
(BitExtractionBenchmarkParams){585, 1, 1024, 10, 2, 4, 7, 4, 3, 4, 3, 3,
3, 4, 3, 3}};
// Add to the list of parameters to benchmark
for (auto x : params)
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, x.ks_base_log, x.ks_level,
x.number_of_inputs,
x.number_of_bits_of_message_including_padding_0,
x.number_of_bits_of_message_including_padding_1,
x.number_of_bits_of_message_including_padding_2,
x.number_of_bits_of_message_including_padding_3,
x.number_of_bits_to_extract_0, x.number_of_bits_to_extract_1,
x.number_of_bits_to_extract_2, x.number_of_bits_to_extract_3});
}
BENCHMARK_REGISTER_F(BitExtraction_u64, ConcreteCuda_BitExtraction)
->Apply(BitExtractionBenchmarkGenerateParams);

View File

@@ -1,122 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int pbs_base_log;
int pbs_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int number_of_inputs;
} CircuitBootstrapBenchmarkParams;
class CircuitBootstrap_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance = 7.52316384526264e-37;
double glwe_modular_variance = 7.52316384526264e-37;
int pbs_base_log;
int pbs_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int number_of_inputs;
int number_of_bits_of_message_including_padding;
int ggsw_size;
uint64_t delta;
int delta_log;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in;
uint64_t *lwe_sk_out;
uint64_t *plaintexts;
double *d_fourier_bsk;
uint64_t *d_pksk;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_ggsw_ct_out_array;
uint64_t *d_lut_vector_indexes;
int8_t *cbs_buffer;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
pksk_base_log = state.range(5);
pksk_level = state.range(6);
cbs_base_log = state.range(7);
cbs_level = state.range(8);
number_of_inputs = state.range(9);
// We generate binary messages
number_of_bits_of_message_including_padding = 2;
ggsw_size = cbs_level * (glwe_dimension + 1) * (glwe_dimension + 1) *
polynomial_size;
circuit_bootstrap_setup(
stream, &csprng, &lwe_sk_in, &lwe_sk_out, &d_fourier_bsk, &d_pksk,
&plaintexts, &d_lwe_ct_in_array, &d_ggsw_ct_out_array,
&d_lut_vector_indexes, &cbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, lwe_modular_variance, glwe_modular_variance,
pksk_base_log, pksk_level, pbs_base_log, pbs_level, cbs_level,
number_of_bits_of_message_including_padding, ggsw_size, &delta_log,
&delta, number_of_inputs, 1, 1, gpu_index);
}
void TearDown(const ::benchmark::State &state) {
circuit_bootstrap_teardown(stream, csprng, lwe_sk_in, lwe_sk_out,
d_fourier_bsk, d_pksk, plaintexts,
d_lwe_ct_in_array, d_lut_vector_indexes,
d_ggsw_ct_out_array, cbs_buffer, gpu_index);
}
};
BENCHMARK_DEFINE_F(CircuitBootstrap_u64, ConcreteCuda_CircuitBootstrap)
(benchmark::State &st) {
for (auto _ : st) {
// Execute circuit bootstrap
cuda_circuit_bootstrap_64(
stream, gpu_index, (void *)d_ggsw_ct_out_array,
(void *)d_lwe_ct_in_array, (void *)d_fourier_bsk, (void *)d_pksk,
(void *)d_lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, pbs_level, pbs_base_log, pksk_level,
pksk_base_log, cbs_level, cbs_base_log, number_of_inputs,
cuda_get_max_shared_memory(gpu_index));
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(number_of_inputs / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
static void
CircuitBootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
std::vector<CircuitBootstrapBenchmarkParams> params = {
(CircuitBootstrapBenchmarkParams){10, 2, 512, 11, 2, 15, 2, 10, 1, 100}};
// Add to the list of parameters to benchmark
for (auto x : params)
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, x.pksk_base_log, x.pksk_level,
x.cbs_base_log, x.cbs_level, x.number_of_inputs});
}
BENCHMARK_REGISTER_F(CircuitBootstrap_u64, ConcreteCuda_CircuitBootstrap)
->Apply(CircuitBootstrapBenchmarkGenerateParams);

View File

@@ -1,226 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <omp.h>
#include <setup_and_teardown.h>
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int pbs_base_log;
int pbs_level;
int input_lwe_ciphertext_count;
} BootstrapBenchmarkParams;
class Bootstrap_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int input_lwe_ciphertext_count;
double lwe_modular_variance = 0.000007069849454709433;
double glwe_modular_variance = 0.00000000000000029403601535432533;
int pbs_base_log;
int pbs_level;
int message_modulus = 4;
int carry_modulus = 4;
int payload_modulus;
uint64_t delta;
std::vector<double *> d_fourier_bsk_array;
std::vector<uint64_t *> d_lut_pbs_identity;
std::vector<uint64_t *> d_lut_pbs_indexes;
std::vector<uint64_t *> d_lwe_ct_in_array;
std::vector<uint64_t *> d_lwe_ct_out_array;
uint64_t *lwe_ct_array;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
Csprng *csprng;
std::vector<int8_t *> pbs_buffer;
int num_gpus;
std::vector<cudaStream_t *> streams;
std::vector<int> input_lwe_ciphertext_count_per_gpu;
public:
void SetUp(const ::benchmark::State &state) {
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
input_lwe_ciphertext_count = state.range(5);
num_gpus = std::min(cuda_get_number_of_gpus(), input_lwe_ciphertext_count);
for (int gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
cudaSetDevice(gpu_index);
cudaStream_t *stream = cuda_create_stream(gpu_index);
streams.push_back(stream);
int input_lwe_ciphertext_count_on_gpu = number_of_inputs_on_gpu(
gpu_index, input_lwe_ciphertext_count, num_gpus);
double *d_fourier_bsk_array_per_gpu;
uint64_t *d_lut_pbs_identity_per_gpu;
uint64_t *d_lut_pbs_indexes_per_gpu;
uint64_t *d_lwe_ct_in_array_per_gpu;
uint64_t *d_lwe_ct_out_array_per_gpu;
int8_t *pbs_buffer_per_gpu;
bootstrap_classical_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array_per_gpu, &plaintexts,
&d_lut_pbs_identity_per_gpu, &d_lut_pbs_indexes_per_gpu,
&d_lwe_ct_in_array_per_gpu, &d_lwe_ct_out_array_per_gpu,
lwe_dimension, glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, pbs_base_log, pbs_level, message_modulus,
carry_modulus, &payload_modulus, &delta,
input_lwe_ciphertext_count_on_gpu, 1, 1, gpu_index);
size_t free, total;
cudaMemGetInfo(&free, &total);
uint64_t buffer_size = get_buffer_size_bootstrap_low_latency_64(
glwe_dimension, polynomial_size, pbs_level,
input_lwe_ciphertext_count_on_gpu,
cuda_get_max_shared_memory(gpu_index));
assert(buffer_size <= free); // the scratch buffer must fit in the available device memory
scratch_cuda_bootstrap_low_latency_64(
stream, gpu_index, &pbs_buffer_per_gpu, glwe_dimension,
polynomial_size, pbs_level, input_lwe_ciphertext_count_on_gpu,
cuda_get_max_shared_memory(gpu_index), true);
d_fourier_bsk_array.push_back(d_fourier_bsk_array_per_gpu);
d_lut_pbs_identity.push_back(d_lut_pbs_identity_per_gpu);
d_lut_pbs_indexes.push_back(d_lut_pbs_indexes_per_gpu);
d_lwe_ct_in_array.push_back(d_lwe_ct_in_array_per_gpu);
d_lwe_ct_out_array.push_back(d_lwe_ct_out_array_per_gpu);
pbs_buffer.push_back(pbs_buffer_per_gpu);
input_lwe_ciphertext_count_per_gpu.push_back(
input_lwe_ciphertext_count_on_gpu);
}
// We keep the following for the benchmarks with copies
lwe_ct_array = (uint64_t *)malloc(
(lwe_dimension + 1) * input_lwe_ciphertext_count * sizeof(uint64_t));
}
void TearDown(const ::benchmark::State &state) {
concrete_cpu_destroy_concrete_csprng(csprng);
free(csprng);
free(lwe_sk_in_array);
free(lwe_sk_out_array);
free(plaintexts);
for (int gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
cudaSetDevice(gpu_index);
cleanup_cuda_bootstrap_low_latency(streams[gpu_index], gpu_index,
&pbs_buffer[gpu_index]);
cuda_drop_async(d_fourier_bsk_array[gpu_index], streams[gpu_index],
gpu_index);
cuda_drop_async(d_lut_pbs_identity[gpu_index], streams[gpu_index],
gpu_index);
cuda_drop_async(d_lut_pbs_indexes[gpu_index], streams[gpu_index],
gpu_index);
cuda_drop_async(d_lwe_ct_in_array[gpu_index], streams[gpu_index],
gpu_index);
cuda_drop_async(d_lwe_ct_out_array[gpu_index], streams[gpu_index],
gpu_index);
cuda_synchronize_stream(streams[gpu_index]);
cuda_destroy_stream(streams[gpu_index], gpu_index);
}
d_fourier_bsk_array.clear();
d_lut_pbs_identity.clear();
d_lut_pbs_indexes.clear();
d_lwe_ct_in_array.clear();
d_lwe_ct_out_array.clear();
pbs_buffer.clear();
input_lwe_ciphertext_count_per_gpu.clear();
streams.clear();
cudaDeviceReset();
}
};
BENCHMARK_DEFINE_F(Bootstrap_u64, ConcreteCuda_LowLatencyPBS)
(benchmark::State &st) {
for (auto _ : st) {
#pragma omp parallel for
for (int gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
// Execute PBS
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
streams[gpu_index], gpu_index, (void *)d_lwe_ct_out_array[gpu_index],
(void *)d_lut_pbs_identity[gpu_index],
(void *)d_lut_pbs_indexes[gpu_index],
(void *)d_lwe_ct_in_array[gpu_index],
(void *)d_fourier_bsk_array[gpu_index], pbs_buffer[gpu_index],
lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
pbs_level, input_lwe_ciphertext_count_per_gpu[gpu_index], 1, 0,
cuda_get_max_shared_memory(gpu_index));
}
for (int gpu_index = 0; gpu_index < num_gpus; gpu_index++) {
cudaSetDevice(gpu_index);
cuda_synchronize_stream(streams[gpu_index]);
}
}
st.counters["Throughput"] =
benchmark::Counter(input_lwe_ciphertext_count / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
static void
BootstrapBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
// lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
// input_lwe_ciphertext_count
std::vector<BootstrapBenchmarkParams> params = {
// BOOLEAN_DEFAULT_PARAMETERS
(BootstrapBenchmarkParams){777, 3, 512, 18, 1, 1},
(BootstrapBenchmarkParams){777, 3, 512, 18, 1, 1000},
// BOOLEAN_TFHE_LIB_PARAMETERS
(BootstrapBenchmarkParams){830, 2, 1024, 23, 1, 1},
(BootstrapBenchmarkParams){830, 2, 1024, 23, 1, 1000},
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
(BootstrapBenchmarkParams){678, 5, 256, 15, 1, 1},
(BootstrapBenchmarkParams){678, 5, 256, 15, 1, 1000},
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
(BootstrapBenchmarkParams){684, 3, 512, 18, 1, 1},
(BootstrapBenchmarkParams){684, 3, 512, 18, 1, 1000},
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
(BootstrapBenchmarkParams){656, 2, 512, 8, 2, 1},
(BootstrapBenchmarkParams){656, 2, 512, 8, 2, 1000},
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
(BootstrapBenchmarkParams){742, 2, 1024, 23, 1, 1},
(BootstrapBenchmarkParams){742, 2, 1024, 23, 1, 1000},
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
(BootstrapBenchmarkParams){745, 1, 2048, 23, 1, 1},
(BootstrapBenchmarkParams){745, 1, 2048, 23, 1, 1000},
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
(BootstrapBenchmarkParams){807, 1, 4096, 22, 1, 1},
(BootstrapBenchmarkParams){807, 1, 4096, 22, 1, 1000},
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
(BootstrapBenchmarkParams){915, 1, 8192, 22, 1, 1},
(BootstrapBenchmarkParams){915, 1, 8192, 22, 1, 100},
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
//(BootstrapBenchmarkParams){864, 1, 8192, 15, 2, 100},
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
(BootstrapBenchmarkParams){930, 1, 16384, 15, 2, 1},
(BootstrapBenchmarkParams){930, 1, 16384, 15, 2, 100},
};
// Add to the list of parameters to benchmark
for (auto x : params) {
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, x.input_lwe_ciphertext_count});
}
}
BENCHMARK_REGISTER_F(Bootstrap_u64, ConcreteCuda_LowLatencyPBS)
->Apply(BootstrapBenchmarkGenerateParams);

View File

@@ -1,96 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <functional>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
typedef struct {
int glwe_dimension;
int polynomial_size;
int p;
int tau;
int base_log;
int level_count;
} CMUXTreeBenchmarkParams;
class CMUXTree_u64 : public benchmark::Fixture {
protected:
int glwe_dimension;
int polynomial_size;
int p;
int tau;
double glwe_modular_variance = 0.00000000000000029403601535432533;
int base_log;
int level_count;
uint64_t delta;
uint32_t delta_log = 60;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *d_lut_identity;
uint64_t *d_ggsw_bit_array;
uint64_t *plaintexts;
uint64_t *d_glwe_out;
uint64_t *glwe_sk;
int8_t *cmux_tree_buffer = nullptr;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
glwe_dimension = state.range(0);
polynomial_size = state.range(1);
p = state.range(2);
tau = state.range(3);
base_log = state.range(4);
level_count = state.range(5);
cmux_tree_setup(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
&d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
glwe_dimension, polynomial_size, base_log, level_count,
glwe_modular_variance, p, tau, &delta_log, 1, 1, gpu_index);
// Value of the shift we multiply our messages by
delta = ((uint64_t)(1) << delta_log);
}
void TearDown(const ::benchmark::State &state) {
cmux_tree_teardown(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
&d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
gpu_index);
}
};
BENCHMARK_DEFINE_F(CMUXTree_u64, ConcreteCuda_CMUXTree)(benchmark::State &st) {
for (auto _ : st) {
// Execute scratch/CMUX tree/cleanup
cuda_cmux_tree_64(stream, gpu_index, (void *)d_glwe_out,
(void *)d_ggsw_bit_array, (void *)d_lut_identity,
cmux_tree_buffer, glwe_dimension, polynomial_size,
base_log, level_count, (1 << (tau * p)), tau,
cuda_get_max_shared_memory(gpu_index));
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(tau * p / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
static void CMUXTreeBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
std::vector<CMUXTreeBenchmarkParams> params = {
// glwe_dimension, polynomial_size, p, tau, base_log, level_count,
(CMUXTreeBenchmarkParams){2, 256, 10, 4, 6, 3},
};
// Add to the list of parameters to benchmark
for (auto x : params)
b->Args({x.glwe_dimension, x.polynomial_size, x.p, x.tau, x.base_log,
x.level_count});
}
BENCHMARK_REGISTER_F(CMUXTree_u64, ConcreteCuda_CMUXTree)
->Apply(CMUXTreeBenchmarkGenerateParams);

View File

@@ -1,313 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <setup_and_teardown.h>
#include <omp.h>
const bool USE_MULTI_GPU = false;
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int ksk_base_log;
int ksk_level;
int total_message_bits;
int number_of_blocks;
int message_modulus;
int carry_modulus;
PBS_TYPE pbs_type;
} IntegerMultiplicationBenchmarkParams;
class IntegerMultiplication_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance = 4.478453795193731e-11;
double glwe_modular_variance = 8.645717832544903e-32;
int pbs_base_log;
int pbs_level;
int ksk_base_log;
int ksk_level;
int message_modulus;
int carry_modulus;
int total_message_bits;
int number_of_blocks;
int payload_modulus;
PBS_TYPE pbs_type;
uint64_t delta;
std::vector<void *> d_bsk_array;
std::vector<uint64_t *> d_ksk_array;
std::vector<uint64_t *> d_lwe_ct_in_array_1;
std::vector<uint64_t *> d_lwe_ct_in_array_2;
std::vector<uint64_t *> d_lwe_ct_out_array;
uint64_t *lwe_sk_in;
uint64_t *lwe_sk_out;
uint64_t *plaintexts_1;
uint64_t *plaintexts_2;
std::vector<int_mul_memory<uint64_t> *> mem_ptr_array;
Csprng *csprng;
int max_gpus_to_use;
int operations_per_gpu;
int num_gpus;
public:
void SetUp(const ::benchmark::State &state) {
cudaDeviceSynchronize();
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
lwe_modular_variance = state.range(3);
glwe_modular_variance = state.range(4);
pbs_base_log = state.range(5);
pbs_level = state.range(6);
ksk_base_log = state.range(7);
ksk_level = state.range(8);
total_message_bits = state.range(9);
number_of_blocks = state.range(10);
message_modulus = state.range(11);
carry_modulus = state.range(12);
int pbs_type_int = state.range(13);
max_gpus_to_use = state.range(14);
operations_per_gpu = state.range(15);
pbs_type = static_cast<PBS_TYPE>(pbs_type_int);
num_gpus = std::min(cuda_get_number_of_gpus(), max_gpus_to_use);
for (int device = 0; device < num_gpus; device++) {
cudaSetDevice(device);
cudaStream_t *stream = cuda_create_stream(device);
void *d_bsk_array_per_gpu;
uint64_t *d_ksk_array_per_gpu;
uint64_t *d_lwe_ct_in_array_1_per_gpu;
uint64_t *d_lwe_ct_in_array_2_per_gpu;
uint64_t *d_lwe_ct_out_array_per_gpu;
int_mul_memory<uint64_t> *mem_ptr_per_gpu = new int_mul_memory<uint64_t>;
integer_multiplication_setup(
stream, &csprng, &lwe_sk_in, &lwe_sk_out,
&d_bsk_array_per_gpu, &d_ksk_array_per_gpu,
&plaintexts_1, &plaintexts_2, &d_lwe_ct_in_array_1_per_gpu,
&d_lwe_ct_in_array_2_per_gpu, &d_lwe_ct_out_array_per_gpu,
mem_ptr_per_gpu, lwe_dimension, glwe_dimension, polynomial_size,
lwe_modular_variance, glwe_modular_variance, pbs_base_log, pbs_level,
ksk_base_log, ksk_level, total_message_bits, number_of_blocks,
message_modulus, carry_modulus, &delta, 1, 1, pbs_type, device);
if (USE_MULTI_GPU) {
scratch_cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
mem_ptr_per_gpu, d_bsk_array_per_gpu, d_ksk_array_per_gpu,
message_modulus, carry_modulus, glwe_dimension, lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, ksk_base_log, ksk_level,
number_of_blocks, pbs_type, cuda_get_max_shared_memory(device),
true);
} else {
scratch_cuda_integer_mult_radix_ciphertext_kb_64(
stream, device, (void *)mem_ptr_per_gpu, message_modulus,
carry_modulus, glwe_dimension, lwe_dimension, polynomial_size,
pbs_base_log, pbs_level, ksk_base_log, ksk_level, number_of_blocks,
pbs_type, cuda_get_max_shared_memory(device), true);
}
d_bsk_array.push_back(d_bsk_array_per_gpu);
d_ksk_array.push_back(d_ksk_array_per_gpu);
d_lwe_ct_in_array_1.push_back(d_lwe_ct_in_array_1_per_gpu);
d_lwe_ct_in_array_2.push_back(d_lwe_ct_in_array_2_per_gpu);
d_lwe_ct_out_array.push_back(d_lwe_ct_out_array_per_gpu);
mem_ptr_array.push_back(mem_ptr_per_gpu);
cuda_synchronize_stream(stream);
cuda_destroy_stream(stream, device);
}
}
void TearDown(const ::benchmark::State &state) {
cudaDeviceSynchronize();
concrete_cpu_destroy_concrete_csprng(csprng);
free(csprng);
free(lwe_sk_in);
free(lwe_sk_out);
free(plaintexts_1);
free(plaintexts_2);
for (int device = 0; device < num_gpus; device++) {
cudaSetDevice(device);
cudaStream_t *stream = cuda_create_stream(device);
cuda_drop_async(d_bsk_array[device], stream, device);
cuda_drop_async(d_ksk_array[device], stream, device);
cuda_drop_async(d_lwe_ct_in_array_1[device], stream, device);
cuda_drop_async(d_lwe_ct_in_array_2[device], stream, device);
cuda_drop_async(d_lwe_ct_out_array[device], stream, device);
int_mul_memory<uint64_t> *mem_ptr = mem_ptr_array[device];
cuda_drop_async(mem_ptr->vector_result_sb, stream, 0);
cuda_drop_async(mem_ptr->block_mul_res, stream, 0);
cuda_drop_async(mem_ptr->small_lwe_vector, stream, 0);
cuda_drop_async(mem_ptr->lwe_pbs_out_array, stream, 0);
cuda_drop_async(mem_ptr->test_vector_array, stream, 0);
cuda_drop_async(mem_ptr->message_acc, stream, 0);
cuda_drop_async(mem_ptr->carry_acc, stream, 0);
cuda_drop_async(mem_ptr->test_vector_indexes, stream, 0);
cuda_drop_async(mem_ptr->tvi_message, stream, 0);
cuda_drop_async(mem_ptr->tvi_carry, stream, 0);
cuda_drop_async(mem_ptr->pbs_buffer, stream, 0);
for (int i = 0; i < mem_ptr->p2p_gpu_count; i++) {
cuda_drop_async(mem_ptr->device_to_device_buffer[i], mem_ptr->streams[i],
i);
cuda_drop_async(mem_ptr->pbs_buffer_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->pbs_input_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->pbs_output_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->test_vector_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->tvi_lsb_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->tvi_msb_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->tvi_message_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->tvi_carry_multi_gpu[i], mem_ptr->streams[i], i);
if (i) {
cuda_drop_async(mem_ptr->bsk_multi_gpu[i], mem_ptr->streams[i], i);
cuda_drop_async(mem_ptr->ksk_multi_gpu[i], mem_ptr->streams[i], i);
}
cuda_destroy_stream(mem_ptr->streams[i], i);
}
cuda_synchronize_stream(stream);
cuda_destroy_stream(stream, device);
}
d_bsk_array.clear();
d_ksk_array.clear();
d_lwe_ct_in_array_1.clear();
d_lwe_ct_in_array_2.clear();
d_lwe_ct_out_array.clear();
mem_ptr_array.clear();
cudaDeviceReset();
}
};
BENCHMARK_DEFINE_F(IntegerMultiplication_u64,
ConcreteCuda_IntegerMultiplication)
(benchmark::State &st) {
int8_t *mult_buffer;
uint32_t ct_degree_out = 0;
uint32_t ct_degree_left = 0;
uint32_t ct_degree_right = 0;
omp_set_nested(true);
for (auto _ : st) {
// Execute multiplication
#pragma omp parallel for num_threads(num_gpus)
for (int device = 0; device < num_gpus; device++) {
cudaSetDevice(device);
auto d_lwe_ct_out = d_lwe_ct_out_array[device];
auto d_lwe_ct_in_1 = d_lwe_ct_in_array_1[device];
auto d_lwe_ct_in_2 = d_lwe_ct_in_array_2[device];
auto d_bsk = d_bsk_array[device];
auto d_ksk = d_ksk_array[device];
auto mem_ptr = mem_ptr_array[device];
#pragma omp parallel for num_threads(operations_per_gpu)
for (int i = 0; i < operations_per_gpu; i++) {
cudaStream_t *stream = cuda_create_stream(device);
if (USE_MULTI_GPU) {
cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
(void *)d_lwe_ct_out, (void *)d_lwe_ct_in_1,
(void *)d_lwe_ct_in_2, &ct_degree_out, &ct_degree_left,
&ct_degree_right, d_bsk, d_ksk, (void *)mem_ptr, message_modulus,
carry_modulus, glwe_dimension, lwe_dimension, polynomial_size,
pbs_base_log, pbs_level, ksk_base_log, ksk_level,
number_of_blocks, pbs_type, cuda_get_max_shared_memory(device));
} else {
cuda_integer_mult_radix_ciphertext_kb_64(
stream, device, (void *)d_lwe_ct_out, (void *)d_lwe_ct_in_1,
(void *)d_lwe_ct_in_2, &ct_degree_out, &ct_degree_left,
&ct_degree_right, d_bsk, d_ksk, (void *)mem_ptr, message_modulus,
carry_modulus, glwe_dimension, lwe_dimension, polynomial_size,
pbs_base_log, pbs_level, ksk_base_log, ksk_level,
number_of_blocks, pbs_type, cuda_get_max_shared_memory(device));
}
cuda_synchronize_stream(stream);
cuda_destroy_stream(stream, device);
}
}
}
}
static void IntegerMultiplicationBenchmarkGenerateParams(
benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
std::vector<IntegerMultiplicationBenchmarkParams> params = {
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 8, 4, 4, 4, LOW_LAT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 16, 8, 4, 4, LOW_LAT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 32, 16, 4, 4, LOW_LAT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 40, 20, 4, 4, LOW_LAT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 64, 32, 4, 4, LOW_LAT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 128, 64, 4, 4, LOW_LAT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 256, 128, 4, 4, LOW_LAT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 8, 4, 4, 4, MULTI_BIT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 16, 8, 4, 4, MULTI_BIT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 32, 16, 4, 4, MULTI_BIT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 40, 20, 4, 4, MULTI_BIT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 64, 32, 4, 4, MULTI_BIT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 128, 64, 4, 4, MULTI_BIT},
(IntegerMultiplicationBenchmarkParams){
744, 1, 2048, 4.478453795193731e-11, 8.645717832544903e-32, 23, 1, 3,
5, 256, 128, 4, 4, MULTI_BIT},
};
int max_gpus_to_use = 8;
// Add to the list of parameters to benchmark
for(int operations_per_gpu = 1; operations_per_gpu < 10; operations_per_gpu++)
for (auto x : params) {
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.lwe_modular_variance, x.glwe_modular_variance, x.pbs_base_log,
x.pbs_level, x.ksk_base_log, x.ksk_level, x.total_message_bits,
x.number_of_blocks, x.message_modulus, x.carry_modulus,
x.pbs_type, max_gpus_to_use, operations_per_gpu});
}
}
BENCHMARK_REGISTER_F(IntegerMultiplication_u64,
ConcreteCuda_IntegerMultiplication)
->Apply(IntegerMultiplicationBenchmarkGenerateParams);

View File

@@ -1,127 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
typedef struct {
int input_lwe_dimension;
int output_lwe_dimension;
int ksk_base_log;
int ksk_level;
int number_of_inputs;
} KeyswitchBenchmarkParams;
class Keyswitch_u64 : public benchmark::Fixture {
protected:
int input_lwe_dimension;
int output_lwe_dimension;
double noise_variance = 2.9802322387695312e-08;
int ksk_base_log;
int ksk_level;
int message_modulus = 4;
int carry_modulus = 4;
int payload_modulus;
int number_of_inputs;
uint64_t delta;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *plaintexts;
uint64_t *d_ksk_array;
uint64_t *d_lwe_out_ct_array;
uint64_t *d_lwe_in_ct_array;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
input_lwe_dimension = state.range(0);
output_lwe_dimension = state.range(1);
ksk_base_log = state.range(2);
ksk_level = state.range(3);
number_of_inputs = state.range(4);
keyswitch_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_ksk_array, &plaintexts, &d_lwe_in_ct_array,
&d_lwe_out_ct_array, input_lwe_dimension,
output_lwe_dimension, noise_variance, ksk_base_log,
ksk_level, message_modulus, carry_modulus, &payload_modulus,
&delta, number_of_inputs, 1, 1, gpu_index);
}
void TearDown(const ::benchmark::State &state) {
keyswitch_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
d_ksk_array, plaintexts, d_lwe_in_ct_array,
d_lwe_out_ct_array, gpu_index);
}
};
BENCHMARK_DEFINE_F(Keyswitch_u64, ConcreteCuda_Keyswitch)
(benchmark::State &st) {
for (auto _ : st) {
// Execute keyswitch
cuda_keyswitch_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct_array,
(void *)d_lwe_in_ct_array, (void *)d_ksk_array, input_lwe_dimension,
output_lwe_dimension, ksk_base_log, ksk_level, number_of_inputs);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(number_of_inputs / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(Keyswitch_u64, ConcreteCuda_CopiesPlusKeyswitch)
(benchmark::State &st) {
uint64_t *lwe_in_ct = (uint64_t *)malloc(
number_of_inputs * (input_lwe_dimension + 1) * sizeof(uint64_t));
uint64_t *lwe_out_ct = (uint64_t *)malloc(
number_of_inputs * (output_lwe_dimension + 1) * sizeof(uint64_t));
void *v_stream = (void *)stream;
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_in_ct_array, lwe_in_ct,
number_of_inputs * (input_lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
// Execute keyswitch
cuda_keyswitch_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct_array,
(void *)d_lwe_in_ct_array, (void *)d_ksk_array, input_lwe_dimension,
output_lwe_dimension, ksk_base_log, ksk_level, number_of_inputs);
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct_array,
number_of_inputs * (output_lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(v_stream);
}
st.counters["Throughput"] =
benchmark::Counter(number_of_inputs / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
free(lwe_in_ct);
free(lwe_out_ct);
}
static void
KeyswitchBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
// na, nb, base_log, level, number_of_inputs
std::vector<KeyswitchBenchmarkParams> params = {
(KeyswitchBenchmarkParams){600, 1024, 3, 8, 1000},
};
// Add to the list of parameters to benchmark
for (auto x : params)
b->Args({x.input_lwe_dimension, x.output_lwe_dimension, x.ksk_base_log,
x.ksk_level, x.number_of_inputs});
}
BENCHMARK_REGISTER_F(Keyswitch_u64, ConcreteCuda_Keyswitch)
->Apply(KeyswitchBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(Keyswitch_u64, ConcreteCuda_CopiesPlusKeyswitch)
->Apply(KeyswitchBenchmarkGenerateParams);

View File

@@ -1,254 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
typedef struct {
int lwe_dimension;
int input_lwe_ciphertext_count;
} LinearAlgebraBenchmarkParams;
class LinearAlgebra_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
double noise_variance = 2.9802322387695312e-08;
int ksk_base_log;
int ksk_level;
int message_modulus = 4;
int carry_modulus = 4;
int num_samples;
uint64_t delta;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *d_lwe_in_1_ct;
uint64_t *d_lwe_in_2_ct;
uint64_t *d_lwe_out_ct;
uint64_t *plaintexts_1;
uint64_t *plaintexts_2;
uint64_t *d_plaintext_2;
uint64_t *d_cleartext;
uint64_t *lwe_in_1_ct;
uint64_t *lwe_in_2_ct;
uint64_t *lwe_out_ct;
uint64_t *lwe_sk_array;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = state.range(0);
num_samples = state.range(1);
int payload_modulus = message_modulus * carry_modulus;
// Value of the shift we multiply our messages by
delta = ((uint64_t)(1) << 63) / (uint64_t)(payload_modulus);
linear_algebra_setup(
stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct, &d_lwe_in_2_ct,
&d_lwe_out_ct, &lwe_in_1_ct, &lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
&plaintexts_2, &d_plaintext_2, &d_cleartext, lwe_dimension,
noise_variance, payload_modulus, delta, num_samples, 1, 1, gpu_index);
}
void TearDown(const ::benchmark::State &state) {
linear_algebra_teardown(
stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct, &d_lwe_in_2_ct,
&d_lwe_out_ct, &lwe_in_1_ct, &lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
&plaintexts_2, &d_plaintext_2, &d_cleartext, gpu_index);
}
};
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_Addition)
(benchmark::State &st) {
// Execute addition
for (auto _ : st) {
cuda_add_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
(void *)d_lwe_in_2_ct, lwe_dimension, num_samples);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusAddition)
(benchmark::State &st) {
// Execute addition
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_memcpy_async_to_gpu(d_lwe_in_2_ct, lwe_in_2_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_add_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
(void *)d_lwe_in_2_ct, lwe_dimension, num_samples);
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_PlaintextAddition)
(benchmark::State &st) {
for (auto _ : st) {
// Execute addition
cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
(void *)d_plaintext_2, lwe_dimension, num_samples);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusPlaintextAddition)
(benchmark::State &st) {
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_memcpy_async_to_gpu(d_plaintext_2, plaintexts_2,
num_samples * sizeof(uint64_t), stream, gpu_index);
// Execute addition
cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
(void *)d_plaintext_2, lwe_dimension, num_samples);
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CleartextMultiplication)
(benchmark::State &st) {
for (auto _ : st) {
// Execute addition
cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
(void *)d_cleartext, lwe_dimension, num_samples);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(LinearAlgebra_u64,
ConcreteCuda_CopiesPlusCleartextMultiplication)
(benchmark::State &st) {
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_memcpy_async_to_gpu(d_cleartext, plaintexts_2,
num_samples * sizeof(uint64_t), stream, gpu_index);
// Execute addition
cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
(void *)d_cleartext, lwe_dimension, num_samples);
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_Negation)
(benchmark::State &st) {
for (auto _ : st) {
// Execute addition
cuda_negate_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
lwe_dimension, num_samples);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusNegation)
(benchmark::State &st) {
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_in_1_ct, lwe_in_1_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
// Execute addition
cuda_negate_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_in_1_ct,
lwe_dimension, num_samples);
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
num_samples * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(num_samples / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
static void
LinearAlgebraBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
// n, input_lwe_ciphertext_count
std::vector<LinearAlgebraBenchmarkParams> params = {
(LinearAlgebraBenchmarkParams){600, 100},
};
// Add to the list of parameters to benchmark
for (auto x : params)
b->Args({x.lwe_dimension, x.input_lwe_ciphertext_count});
}
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_Addition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusAddition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_PlaintextAddition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebra_u64,
ConcreteCuda_CopiesPlusPlaintextAddition)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_CleartextMultiplication)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebra_u64,
ConcreteCuda_CopiesPlusCleartextMultiplication)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_Negation)
->Apply(LinearAlgebraBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(LinearAlgebra_u64, ConcreteCuda_CopiesPlusNegation)
->Apply(LinearAlgebraBenchmarkGenerateParams);

View File

@@ -1,183 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <omp.h>
#include <setup_and_teardown.h>
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int pbs_base_log;
int pbs_level;
int input_lwe_ciphertext_count;
int grouping_factor;
int chunk_size;
} MultiBitPBSBenchmarkParams;
class MultiBitBootstrap_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int input_lwe_ciphertext_count;
int input_lwe_ciphertext_count_per_gpu;
int grouping_factor;
double lwe_modular_variance = 0.000007069849454709433;
double glwe_modular_variance = 0.00000000000000029403601535432533;
int pbs_base_log;
int pbs_level;
int message_modulus = 4;
int carry_modulus = 4;
int payload_modulus;
uint64_t delta;
std::vector<uint64_t *> d_bsk_array;
std::vector<uint64_t *> d_lut_pbs_identity;
std::vector<uint64_t *> d_lut_pbs_indexes;
std::vector<uint64_t *> d_lwe_ct_in_array;
std::vector<uint64_t *> d_lwe_ct_out_array;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
Csprng *csprng;
std::vector<int8_t *> pbs_buffer;
int chunk_size;
int num_gpus;
std::vector<cudaStream_t *> streams;
public:
void SetUp(const ::benchmark::State &state) {
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
input_lwe_ciphertext_count = state.range(5);
grouping_factor = state.range(6);
chunk_size = state.range(7);
num_gpus = std::min(cuda_get_number_of_gpus(), input_lwe_ciphertext_count);
assert(input_lwe_ciphertext_count % num_gpus == 0);
input_lwe_ciphertext_count_per_gpu =
std::max(1, input_lwe_ciphertext_count / num_gpus);
// Create streams
for (int device = 0; device < num_gpus; device++) {
cudaSetDevice(device);
cudaStream_t *stream = cuda_create_stream(device);
streams.push_back(stream);
uint64_t *d_bsk_array_per_gpu;
uint64_t *d_lut_pbs_identity_per_gpu;
uint64_t *d_lut_pbs_indexes_per_gpu;
uint64_t *d_lwe_ct_in_array_per_gpu;
uint64_t *d_lwe_ct_out_array_per_gpu;
int8_t *pbs_buffer_per_gpu;
bootstrap_multibit_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_bsk_array_per_gpu, &plaintexts, &d_lut_pbs_identity_per_gpu,
&d_lut_pbs_indexes_per_gpu, &d_lwe_ct_in_array_per_gpu,
&d_lwe_ct_out_array_per_gpu, &pbs_buffer_per_gpu, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor,
lwe_modular_variance, glwe_modular_variance, pbs_base_log, pbs_level,
message_modulus, carry_modulus, &payload_modulus, &delta,
input_lwe_ciphertext_count_per_gpu, 1, 1, device, chunk_size);
d_bsk_array.push_back(d_bsk_array_per_gpu);
d_lut_pbs_identity.push_back(d_lut_pbs_identity_per_gpu);
d_lut_pbs_indexes.push_back(d_lut_pbs_indexes_per_gpu);
d_lwe_ct_in_array.push_back(d_lwe_ct_in_array_per_gpu);
d_lwe_ct_out_array.push_back(d_lwe_ct_out_array_per_gpu);
pbs_buffer.push_back(pbs_buffer_per_gpu);
}
}
void TearDown(const ::benchmark::State &state) {
concrete_cpu_destroy_concrete_csprng(csprng);
free(csprng);
free(lwe_sk_in_array);
free(lwe_sk_out_array);
free(plaintexts);
for (int device = 0; device < num_gpus; device++) {
cudaSetDevice(device);
cleanup_cuda_multi_bit_pbs(streams[device], device, &pbs_buffer[device]);
cuda_drop_async(d_bsk_array[device], streams[device], device);
cuda_drop_async(d_lut_pbs_identity[device], streams[device], device);
cuda_drop_async(d_lut_pbs_indexes[device], streams[device], device);
cuda_drop_async(d_lwe_ct_in_array[device], streams[device], device);
cuda_drop_async(d_lwe_ct_out_array[device], streams[device], device);
cuda_synchronize_stream(streams[device]);
cuda_destroy_stream(streams[device], device);
}
d_bsk_array.clear();
d_lut_pbs_identity.clear();
d_lut_pbs_indexes.clear();
d_lwe_ct_in_array.clear();
d_lwe_ct_out_array.clear();
pbs_buffer.clear();
streams.clear();
cudaDeviceReset();
}
};
BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, ConcreteCuda_MultiBit)
(benchmark::State &st) {
for (auto _ : st) {
#pragma omp parallel for num_threads(num_gpus)
for (int device = 0; device < num_gpus; device++) {
cudaSetDevice(device);
// Execute PBS
cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
streams[device], device, (void *)d_lwe_ct_out_array[device],
(void *)d_lut_pbs_identity[device], (void *)d_lut_pbs_indexes[device],
(void *)d_lwe_ct_in_array[device], (void *)d_bsk_array[device],
pbs_buffer[device], lwe_dimension, glwe_dimension, polynomial_size,
grouping_factor, pbs_base_log, pbs_level,
input_lwe_ciphertext_count_per_gpu, 1, 0,
cuda_get_max_shared_memory(device), chunk_size);
}
for (int device = 0; device < num_gpus; device++) {
cudaSetDevice(device);
cuda_synchronize_stream(streams[device]);
}
}
st.counters["Throughput"] =
benchmark::Counter(input_lwe_ciphertext_count / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
static void
MultiBitPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
// lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
// input_lwe_ciphertext_count
std::vector<MultiBitPBSBenchmarkParams> params = {
// 4_bits_multi_bit_group_2
(MultiBitPBSBenchmarkParams){818, 1, 2048, 22, 1, 1, 2},
// 4_bits_multi_bit_group_3
(MultiBitPBSBenchmarkParams){888, 1, 2048, 21, 1, 1, 3},
(MultiBitPBSBenchmarkParams){742, 1, 2048, 23, 1, 1, 2},
(MultiBitPBSBenchmarkParams){744, 1, 2048, 23, 1, 1, 3},
};
// Add to the list of parameters to benchmark
for (auto x : params) {
for (int input_lwe_ciphertext_count = 1;
input_lwe_ciphertext_count <= 16384; input_lwe_ciphertext_count *= 2)
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, input_lwe_ciphertext_count,
x.grouping_factor, 0});
}
}
BENCHMARK_REGISTER_F(MultiBitBootstrap_u64, ConcreteCuda_MultiBit)
->Apply(MultiBitPBSBenchmarkGenerateParams);

View File

@@ -1,184 +0,0 @@
#include <benchmark/benchmark.h>
#include <cstdint>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
const unsigned MAX_TAU = 4;
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int tau;
int p;
} WopPBSBenchmarkParams;
class WopPBS_u64 : public benchmark::Fixture {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance = 7.52316384526264e-37;
double glwe_modular_variance = 7.52316384526264e-37;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int tau;
uint32_t p_array[MAX_TAU];
int input_lwe_dimension;
uint64_t delta_array[MAX_TAU];
int cbs_delta_log;
uint32_t delta_log_array[MAX_TAU];
int delta_log_lut;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *plaintexts;
double *d_fourier_bsk;
uint64_t *lwe_sk_in;
uint64_t *lwe_sk_out;
uint64_t *d_ksk;
uint64_t *d_pksk;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *d_lut_vector;
int8_t *wop_pbs_buffer;
uint64_t *lwe_ct_in_array;
uint64_t *lwe_ct_out_array;
public:
// Test arithmetic functions
void SetUp(const ::benchmark::State &state) {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = state.range(0);
glwe_dimension = state.range(1);
polynomial_size = state.range(2);
pbs_base_log = state.range(3);
pbs_level = state.range(4);
ks_base_log = state.range(5);
ks_level = state.range(6);
pksk_base_log = state.range(7);
pksk_level = state.range(8);
cbs_base_log = state.range(9);
cbs_level = state.range(10);
tau = state.range(11);
p_array[0] = state.range(12);
wop_pbs_setup(stream, &csprng, &lwe_sk_in, &lwe_sk_out, &d_ksk,
&d_fourier_bsk, &d_pksk, &plaintexts, &d_lwe_ct_in_array,
&d_lwe_ct_out_array, &d_lut_vector, &wop_pbs_buffer,
lwe_dimension, glwe_dimension, polynomial_size,
lwe_modular_variance, glwe_modular_variance, ks_base_log,
ks_level, pksk_base_log, pksk_level, pbs_base_log, pbs_level,
cbs_level, p_array, delta_log_array, &cbs_delta_log,
delta_array, tau, 1, 1, gpu_index);
// We keep the following for the benchmarks with copies
lwe_ct_in_array = (uint64_t *)malloc(
(glwe_dimension * polynomial_size + 1) * tau * sizeof(uint64_t));
lwe_ct_out_array = (uint64_t *)malloc(
(glwe_dimension * polynomial_size + 1) * tau * sizeof(uint64_t));
for (int i = 0; i < tau; i++) {
uint64_t plaintext = plaintexts[i];
uint64_t *lwe_ct_in =
lwe_ct_in_array +
(ptrdiff_t)(i * (glwe_dimension * polynomial_size + 1));
concrete_cpu_encrypt_lwe_ciphertext_u64(
lwe_sk_in, lwe_ct_in, plaintext, glwe_dimension * polynomial_size,
lwe_modular_variance, csprng, &CONCRETE_CSPRNG_VTABLE);
}
}
void TearDown(const ::benchmark::State &state) {
wop_pbs_teardown(stream, csprng, lwe_sk_in, lwe_sk_out, d_ksk,
d_fourier_bsk, d_pksk, plaintexts, d_lwe_ct_in_array,
d_lut_vector, d_lwe_ct_out_array, wop_pbs_buffer,
gpu_index);
free(lwe_ct_in_array);
free(lwe_ct_out_array);
}
};
BENCHMARK_DEFINE_F(WopPBS_u64, ConcreteCuda_WopPBS)(benchmark::State &st) {
for (auto _ : st) {
// Execute wop pbs
cuda_wop_pbs_64(stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_ct_in_array, (void *)d_lut_vector,
(void *)d_fourier_bsk, (void *)d_ksk, (void *)d_pksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
ks_base_log, ks_level, pksk_base_log, pksk_level,
cbs_base_log, cbs_level, p_array, delta_log_array, tau,
cuda_get_max_shared_memory(gpu_index));
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(tau * p_array[0] / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
BENCHMARK_DEFINE_F(WopPBS_u64, ConcreteCuda_CopiesPlusWopPBS)
(benchmark::State &st) {
for (auto _ : st) {
cuda_memcpy_async_to_gpu(d_lwe_ct_in_array, lwe_ct_in_array,
(input_lwe_dimension + 1) * tau * sizeof(uint64_t),
stream, gpu_index);
// Execute wop pbs
cuda_wop_pbs_64(stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lwe_ct_in_array, (void *)d_lut_vector,
(void *)d_fourier_bsk, (void *)d_ksk, (void *)d_pksk,
wop_pbs_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
ks_base_log, ks_level, pksk_base_log, pksk_level,
cbs_base_log, cbs_level, p_array, delta_log_array, tau,
cuda_get_max_shared_memory(gpu_index));
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(input_lwe_dimension + 1) * tau * sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(stream);
}
st.counters["Throughput"] =
benchmark::Counter(tau * p_array[0] / get_aws_cost_per_second(),
benchmark::Counter::kIsIterationInvariantRate);
}
static void WopPBSBenchmarkGenerateParams(benchmark::internal::Benchmark *b) {
// Define the parameters to benchmark
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// ks_base_log, ks_level, tau, p
std::vector<WopPBSBenchmarkParams> params = {
(WopPBSBenchmarkParams){481, 2, 512, 4, 9, 1, 9, 4, 9, 6, 4, 1, 10},
//// INTEGER_PARAM_MESSAGE_4_CARRY_4_16_BITS
//(WopPBSBenchmarkParams){481, 1, 2048, 9, 4, 1, 9, 9, 4, 6, 4, 1, 8},
//// INTEGER_PARAM_MESSAGE_2_CARRY_2_16_BITS
//(WopPBSBenchmarkParams){493, 1, 2048, 16, 2, 2, 5, 16, 2, 6, 4, 1, 4},
};
// Add to the list of parameters to benchmark
for (auto x : params)
b->Args({x.lwe_dimension, x.glwe_dimension, x.polynomial_size,
x.pbs_base_log, x.pbs_level, x.ks_base_log, x.ks_level,
x.pksk_base_log, x.pksk_level, x.cbs_base_log, x.cbs_level, x.tau,
x.p});
}
BENCHMARK_REGISTER_F(WopPBS_u64, ConcreteCuda_WopPBS)
->Apply(WopPBSBenchmarkGenerateParams);
BENCHMARK_REGISTER_F(WopPBS_u64, ConcreteCuda_CopiesPlusWopPBS)
->Apply(WopPBSBenchmarkGenerateParams);

View File

@@ -1,3 +0,0 @@
#include <benchmark/benchmark.h>
BENCHMARK_MAIN();

View File

@@ -1,228 +0,0 @@
#ifndef SETUP_AND_TEARDOWN_H
#define SETUP_AND_TEARDOWN_H
#include <bit_extraction.h>
#include <bootstrap.h>
#include <bootstrap_multibit.h>
#include <circuit_bootstrap.h>
#include <concrete-cpu.h>
#include <device.h>
#include <keyswitch.h>
#include <linear_algebra.h>
#include <utils.h>
#include <vertical_packing.h>
template <typename Torus> struct int_mul_memory {
Torus *vector_result_sb;
Torus *block_mul_res;
Torus *small_lwe_vector;
Torus *lwe_pbs_out_array;
Torus *test_vector_array;
Torus *message_acc;
Torus *carry_acc;
Torus *test_vector_indexes;
Torus *tvi_message;
Torus *tvi_carry;
int8_t *pbs_buffer;
int p2p_gpu_count = 0;
cudaStream_t *streams[32];
int8_t *pbs_buffer_multi_gpu[32];
Torus *pbs_input_multi_gpu[32];
Torus *pbs_output_multi_gpu[32];
Torus *test_vector_multi_gpu[32];
Torus *tvi_lsb_multi_gpu[32];
Torus *tvi_msb_multi_gpu[32];
Torus *tvi_message_multi_gpu[32];
Torus *tvi_carry_multi_gpu[32];
Torus *bsk_multi_gpu[32];
Torus *ksk_multi_gpu[32];
Torus *device_to_device_buffer[8];
bool IsAppBuiltAs64() { return sizeof(void *) == 8; }
};
void bootstrap_classical_setup(
cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
uint64_t **plaintexts, uint64_t **d_lut_pbs_identity,
uint64_t **d_lut_pbs_indexes, uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_ct_out_array, int lwe_dimension, int glwe_dimension,
int polynomial_size, double lwe_modular_variance,
double glwe_modular_variance, int pbs_base_log, int pbs_level,
int message_modulus, int carry_modulus, int *payload_modulus,
uint64_t *delta, int number_of_inputs, int repetitions, int samples,
int gpu_index);
void bootstrap_classical_teardown(
cudaStream_t *stream, Csprng *csprng, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, double *d_fourier_bsk_array,
uint64_t *plaintexts, uint64_t *d_lut_pbs_identity,
uint64_t *d_lut_pbs_indexes, uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_ct_out_array, int gpu_index);
void bootstrap_multibit_setup(
cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, uint64_t **d_bsk_array, uint64_t **plaintexts,
uint64_t **d_lut_pbs_identity, uint64_t **d_lut_pbs_indexes,
uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_ct_out_array,
int8_t **pbs_buffer, int lwe_dimension, int glwe_dimension,
int polynomial_size, int grouping_factor, double lwe_modular_variance,
double glwe_modular_variance, int pbs_base_log, int pbs_level,
int message_modulus, int carry_modulus, int *payload_modulus,
uint64_t *delta, int number_of_inputs, int repetitions, int samples,
int gpu_index, int chunk_size = 0);
void bootstrap_multibit_teardown(
cudaStream_t *stream, Csprng *csprng, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, uint64_t *d_bsk_array, uint64_t *plaintexts,
uint64_t *d_lut_pbs_identity, uint64_t *d_lut_pbs_indexes,
uint64_t *d_lwe_ct_in_array, uint64_t *d_lwe_ct_out_array,
int8_t **pbs_buffer, int gpu_index);
void keyswitch_setup(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
uint64_t **d_ksk_array, uint64_t **plaintexts,
uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_ct_out_array, int input_lwe_dimension,
int output_lwe_dimension, double lwe_modular_variance,
int ksk_base_log, int ksk_level, int message_modulus,
int carry_modulus, int *payload_modulus, uint64_t *delta,
int number_of_inputs, int repetitions, int samples,
int gpu_index);
void keyswitch_teardown(cudaStream_t *stream, Csprng *csprng,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
uint64_t *d_ksk_array, uint64_t *plaintexts,
uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_ct_out_array, int gpu_index);
void bit_extraction_setup(
cudaStream_t **stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
uint64_t **d_ksk_array, uint64_t **plaintexts, uint64_t **d_lwe_ct_in_array,
uint64_t **d_lwe_ct_out_array, int8_t **bit_extract_buffer_array,
int lwe_dimension, int glwe_dimension, int polynomial_size,
double lwe_modular_variance, double glwe_modular_variance, int ks_base_log,
int ks_level, int pbs_base_log, int pbs_level,
uint32_t *number_of_bits_of_message_including_padding_array,
uint32_t *number_of_bits_to_extract_array, uint32_t *delta_log_array,
uint64_t *delta_array, int crt_decomposition_size, int repetitions,
int samples, int gpu_index);
void bit_extraction_teardown(cudaStream_t **stream, Csprng *csprng,
uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array,
double *d_fourier_bsk_array, uint64_t *d_ksk_array,
uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
uint64_t *d_lwe_ct_out_array,
int8_t **bit_extract_buffer_array, int samples,
int gpu_index);
void circuit_bootstrap_setup(
cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, double **d_fourier_bsk_array,
uint64_t **d_pksk_array, uint64_t **plaintexts,
uint64_t **d_lwe_ct_in_array, uint64_t **d_ggsw_ct_out_array,
uint64_t **d_lut_vector_indexes, int8_t **cbs_buffer, int lwe_dimension,
int glwe_dimension, int polynomial_size, double lwe_modular_variance,
double glwe_modular_variance, int pksk_base_log, int pksk_level,
int pbs_base_log, int pbs_level, int cbs_level,
int number_of_bits_of_message_including_padding, int ggsw_size,
int *delta_log, uint64_t *delta, int number_of_inputs, int repetitions,
int samples, int gpu_index);
void circuit_bootstrap_teardown(
cudaStream_t *stream, Csprng *csprng, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, double *d_fourier_bsk_array,
uint64_t *d_pksk_array, uint64_t *plaintexts, uint64_t *d_lwe_ct_in_array,
uint64_t *d_lut_vector_indexes, uint64_t *d_ggsw_ct_out_array,
int8_t *cbs_buffer, int gpu_index);
void cmux_tree_setup(cudaStream_t *stream, Csprng **csprng, uint64_t **glwe_sk,
uint64_t **d_lut_identity, uint64_t **plaintexts,
uint64_t **d_ggsw_bit_array, int8_t **cmux_tree_buffer,
uint64_t **d_glwe_out, int glwe_dimension,
int polynomial_size, int base_log, int level_count,
double glwe_modular_variance, int r_lut, int tau,
uint32_t *delta_log, int repetitions, int samples,
int gpu_index);
void cmux_tree_teardown(cudaStream_t *stream, Csprng **csprng,
uint64_t **glwe_sk, uint64_t **d_lut_identity,
uint64_t **plaintexts, uint64_t **d_ggsw_bit_array,
int8_t **cmux_tree_buffer, uint64_t **d_glwe_out,
int gpu_index);
void wop_pbs_setup(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_in_array, uint64_t **lwe_sk_out_array,
uint64_t **d_ksk_array, double **d_fourier_bsk_array,
uint64_t **d_pksk_array, uint64_t **plaintexts,
uint64_t **d_lwe_ct_in_array, uint64_t **d_lwe_ct_out_array,
uint64_t **d_lut_vector, int8_t **wop_pbs_buffer,
int lwe_dimension, int glwe_dimension, int polynomial_size,
double lwe_modular_variance, double glwe_modular_variance,
int ks_base_log, int ks_level, int pksk_base_log,
int pksk_level, int pbs_base_log, int pbs_level,
int cbs_level, uint32_t *p_array, uint32_t *delta_log_array,
int *cbs_delta_log, uint64_t *delta_array, int tau,
int repetitions, int samples, int gpu_index);
void wop_pbs_teardown(cudaStream_t *stream, Csprng *csprng,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
uint64_t *d_ksk_array, double *d_fourier_bsk_array,
uint64_t *d_pksk_array, uint64_t *plaintexts,
uint64_t *d_lwe_ct_in_array, uint64_t *d_lut_vector,
uint64_t *d_lwe_ct_out_array, int8_t *wop_pbs_buffer,
int gpu_index);
void linear_algebra_setup(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_array, uint64_t **d_lwe_in_1_ct,
uint64_t **d_lwe_in_2_ct, uint64_t **d_lwe_out_ct,
uint64_t **lwe_in_1_ct, uint64_t **lwe_in_2_ct,
uint64_t **lwe_out_ct, uint64_t **plaintexts_1,
uint64_t **plaintexts_2, uint64_t **d_plaintext_2,
uint64_t **d_plaintexts_2_mul, int lwe_dimension,
double noise_variance, int payload_modulus,
uint64_t delta, int number_of_inputs, int repetitions,
int samples, int gpu_index);
void linear_algebra_teardown(cudaStream_t *stream, Csprng **csprng,
uint64_t **lwe_sk_array, uint64_t **d_lwe_in_1_ct,
uint64_t **d_lwe_in_2_ct, uint64_t **d_lwe_out_ct,
uint64_t **lwe_in_1_ct, uint64_t **lwe_in_2_ct,
uint64_t **lwe_out_ct, uint64_t **plaintexts_1,
uint64_t **plaintexts_2, uint64_t **d_plaintext_2,
uint64_t **d_plaintexts_2_mul, int gpu_index);
void fft_setup(cudaStream_t *stream, double **poly1, double **poly2,
double2 **h_cpoly1, double2 **h_cpoly2, double2 **d_cpoly1,
double2 **d_cpoly2, size_t polynomial_size, int samples,
int gpu_index);
void fft_teardown(cudaStream_t *stream, double *poly1, double *poly2,
double2 *h_cpoly1, double2 *h_cpoly2, double2 *d_cpoly1,
double2 *d_cpoly2, int gpu_index);
void integer_multiplication_setup(
cudaStream_t *stream, Csprng **csprng, uint64_t **lwe_sk_in_array,
uint64_t **lwe_sk_out_array, void **d_bsk_array, uint64_t **d_ksk_array,
uint64_t **plaintexts_1, uint64_t **plaintexts_2,
uint64_t **d_lwe_ct_in_array_1, uint64_t **d_lwe_ct_in_array_2,
uint64_t **d_lwe_ct_out_array, int_mul_memory<uint64_t> *mem_ptr,
int lwe_dimension, int glwe_dimension, int polynomial_size,
double lwe_modular_variance, double glwe_modular_variance, int pbs_base_log,
int pbs_level, int ksk_base_log, int ksk_level, int total_message_bits,
int number_of_blocks, int message_modulus, int carry_modulus,
uint64_t *delta, int repetitions, int samples, PBS_TYPE pbs_type, int gpu_index);
void integer_multiplication_teardown(
cudaStream_t *stream, Csprng *csprng, uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array, void *d_bsk_array, uint64_t *d_ksk_array,
uint64_t *plaintexts_1, uint64_t *plaintexts_2,
uint64_t *d_lwe_ct_in_array_1, uint64_t *d_lwe_ct_in_array_2,
uint64_t *d_lwe_ct_out_array, int_mul_memory<uint64_t> *mem_ptr);
#endif // SETUP_AND_TEARDOWN_H

View File

@@ -1,85 +0,0 @@
#ifndef UTILS_H
#define UTILS_H
#include <concrete-cpu.h>
#include <device.h>
#include <functional>
#include <tfhe.h>
// This is the price per hour of a p3.2xlarge instance on Amazon AWS
#define AWS_VM_COST_PER_HOUR (double)3.06
double get_aws_cost_per_second();
uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,
int number_of_inputs, const unsigned repetitions,
const unsigned samples);
uint64_t *generate_plaintexts_bit_extract(uint64_t *payload_modulus,
uint64_t *delta,
int crt_decomposition_size,
const unsigned repetitions,
const unsigned samples);
uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension,
int message_modulus, int carry_modulus,
std::function<uint64_t(uint64_t)> func);
uint64_t *generate_identity_lut_cmux_tree(int polynomial_size, int num_lut,
int tau, int delta_log);
void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension,
Csprng *csprng, const unsigned repetitions);
void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension,
int polynomial_size, Csprng *csprng,
const unsigned repetitions);
void generate_lwe_bootstrap_keys(
cudaStream_t *stream, int gpu_index, double **d_fourier_bsk_array,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size, int pbs_level, int pbs_base_log,
Csprng *csprng, double variance, const unsigned repetitions);
void generate_lwe_multi_bit_pbs_keys(
cudaStream_t *stream, int gpu_index, uint64_t **d_bsk_array,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size, int pbs_level, int pbs_base_log,
int grouping_factor, Csprng *csprng, double variance,
const unsigned repetitions);
void generate_lwe_keyswitch_keys(cudaStream_t *stream, int gpu_index,
uint64_t **d_ksk_array,
uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array,
int input_lwe_dimension,
int output_lwe_dimension, int ksk_level,
int ksk_base_log, Csprng *csprng,
double variance, const unsigned repetitions);
void generate_lwe_private_functional_keyswitch_key_lists(
cudaStream_t *stream, int gpu_index, uint64_t **d_pksk_array,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
int input_lwe_dimension, int output_glwe_dimension,
int output_polynomial_size, int pksk_level, int pksk_base_log,
Csprng *csprng, double variance, const unsigned repetitions);
uint64_t closest_representable(uint64_t input, int level_count, int base_log);
uint64_t *bit_decompose_value(uint64_t value, int r);
uint64_t number_of_inputs_on_gpu(uint64_t gpu_index,
uint64_t lwe_ciphertext_count,
uint64_t number_of_gpus);
void encrypt_integer_u64_blocks(uint64_t **ct, uint64_t *lwe_sk,
uint64_t *message_blocks, int lwe_dimension,
int num_blocks, Csprng *csprng,
double variance);
void decrypt_integer_u64_blocks(uint64_t *ct, uint64_t *lwe_sk,
uint64_t **message_blocks, int lwe_dimension,
int num_blocks, uint64_t delta,
int message_modulus);
#endif

View File

@@ -1,87 +0,0 @@
find_package(CUDA REQUIRED)
find_package(CUDAToolkit REQUIRED)
include(FetchContent)
FetchContent_Declare(googletest
URL https://github.com/google/googletest/archive/03597a01ee50ed33e9dfd640b249b4be3799d395.zip)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt
ON
CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)
set(CONCRETE_CPU_BINARY_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation/target/release")
set(CONCRETE_CPU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../../concrete-cpu/implementation")
set(CONCRETE_CUDA_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../")
# Enable ExternalProject CMake module
include(ExternalProject)
if(NOT TARGET concrete_cpu)
ExternalProject_Add(
concrete_cpu
SOURCE_DIR ${CONCRETE_CPU_SOURCE_DIR}
DOWNLOAD_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND cargo +nightly build --release --features=nightly
COMMAND cargo +nightly build --release --features=nightly
BINARY_DIR ${CONCRETE_CPU_BINARY_DIR}
BUILD_ALWAYS true
INSTALL_COMMAND ""
LOG_BUILD ON)
endif()
set(TFHE_RS_SOURCE_DIR "${CMAKE_BINARY_DIR}/tfhe-rs")
set(TFHE_RS_BINARY_DIR "${TFHE_RS_SOURCE_DIR}/target/release")
if(NOT TARGET tfhe-rs)
ExternalProject_Add(
tfhe-rs
GIT_REPOSITORY https://github.com/zama-ai/tfhe-rs.git
GIT_TAG main
SOURCE_DIR ${TFHE_RS_SOURCE_DIR}
BUILD_IN_SOURCE 1
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND make build_c_api
INSTALL_COMMAND ""
LOG_BUILD ON)
endif()
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../include)
include_directories(${CONCRETE_CPU_SOURCE_DIR}/include)
include_directories(${CONCRETE_CUDA_SOURCE_DIR}/include)
include_directories(${TFHE_RS_BINARY_DIR})
include_directories("${CUDA_INCLUDE_DIRS}" "${CMAKE_CURRENT_SOURCE_DIR}")
add_library(concrete_cpu_lib STATIC IMPORTED)
add_dependencies(concrete_cpu_lib concrete_cpu)
set_target_properties(concrete_cpu_lib PROPERTIES IMPORTED_LOCATION ${CONCRETE_CPU_BINARY_DIR}/libconcrete_cpu.a)
add_library(tfhe_rs_lib STATIC IMPORTED)
add_dependencies(tfhe_rs_lib tfhe-rs)
set_target_properties(tfhe_rs_lib PROPERTIES IMPORTED_LOCATION ${TFHE_RS_BINARY_DIR}/libtfhe.a)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,--no-as-needed,--allow-multiple-definition -ldl")
set(BINARY test_concrete_cuda)
file(
GLOB_RECURSE TEST_SOURCES
LIST_DIRECTORIES false
test_*.cpp)
add_executable(${BINARY} ${TEST_SOURCES} ../utils.cpp ../setup_and_teardown.cpp)
add_test(NAME ${BINARY} COMMAND ${BINARY})
set_target_properties(
test_concrete_cuda
PROPERTIES CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
CUDA_ARCHITECTURES native)
target_link_libraries(test_concrete_cuda PUBLIC GTest::gtest_main concrete_cpu_lib tfhe_rs_lib concrete_cuda cudart)
include(GoogleTest)
gtest_discover_tests(test_concrete_cuda)

View File

@@ -1,60 +0,0 @@
# test_concrete_cuda
This test tool is built on top of the GoogleTest library. It checks the correctness of the CUDA-accelerated functions of the Concrete framework and helps identify arithmetic flaws.
The output format can be adjusted according to the user's needs.
Each test case executes a particular function and verifies its result against the expected behavior. This is repeated for multiple encryption keys and for several samples per key; both counts can be changed through the `REPETITIONS` and `SAMPLES` variables defined at the beginning of each test file, as shown below.
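As an illustration, these constants sit at the top of every test source file; the values below are taken from the bit-extraction test and differ from one file to another.
```cpp
// Number of independent key generations, and number of samples per key.
// Increasing these makes the tests more thorough at the cost of runtime.
const unsigned REPETITIONS = 2;
const unsigned SAMPLES = 10;
```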
## How to Compile
The first step in compiling code with CMake is to create a build directory. This directory will
contain all the files generated during the build process, such as object files and executables.
We recommend creating this directory inside the implementation folder, separate from the source
files, to keep the source tree clean.
```bash
$ cd concrete/backends/concrete-cuda/implementation
$ mkdir build
$ cd build
```
Run CMake to generate the build files and then use make to compile the project.
```bash
$ cmake ..
$ make
```
The binary will be found in
`concrete/backends/concrete-cuda/implementation/build/test_and_benchmark/test`.
## How to Run Tests
To run tests, you can simply execute the `test_concrete_cuda` executable with no arguments:
```bash
$ test_and_benchmark/test/test_concrete_cuda
```
This will run all the available tests.
## How to Filter Tests
You can select a subset of tests by passing a filter on the test names as an argument. Only
tests whose full name matches the filter will be executed.
For example, to run only tests whose name starts with the word "Bootstrap", you can execute:
```bash
$ test_and_benchmark/test/test_concrete_cuda --gtest_filter=Bootstrap*
```
The `--gtest_list_tests` flag can be used to list all the available tests, and a more detailed
description of how to select a subset of tests can be found in the
[GoogleTest documentation](http://google.github.io/googletest/advanced.html#running-a-subset-of-the-tests).
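For example, listing every registered test before choosing a filter can be done with the same binary:
```bash
$ test_and_benchmark/test/test_concrete_cuda --gtest_list_tests
```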
## Conclusion
With these options, you can easily verify the correctness of concrete-cuda's implementations. If
you have any questions or issues, please feel free to contact us.
To learn more about the GoogleTest library, please refer to the [official user guide](http://google.github.io/googletest/).

View File

@@ -1,243 +0,0 @@
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
const unsigned REPETITIONS = 2;
const unsigned MAX_INPUTS = 4;
const unsigned SAMPLES = 10;
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
uint32_t number_of_bits_of_message_including_padding_array[MAX_INPUTS];
uint32_t number_of_bits_to_extract_array[MAX_INPUTS];
int number_of_inputs;
} BitExtractionTestParams;
class BitExtractionTestPrimitives_u64
: public ::testing::TestWithParam<BitExtractionTestParams> {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
uint32_t number_of_bits_of_message_including_padding_array[MAX_INPUTS];
uint32_t number_of_bits_to_extract_array[MAX_INPUTS];
int number_of_inputs;
uint64_t delta_array[MAX_INPUTS];
uint32_t delta_log_array[MAX_INPUTS];
Csprng *csprng;
cudaStream_t *stream_array[SAMPLES];
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *lwe_ct_in_array;
uint64_t *plaintexts;
double *d_fourier_bsk_array;
uint64_t *d_ksk_array;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
int8_t *bit_extract_buffer_array[SAMPLES];
int input_lwe_dimension;
int output_lwe_dimension;
public:
// Test arithmetic functions
void SetUp() {
for (size_t i = 0; i < SAMPLES; i++) {
stream_array[i] = cuda_create_stream(0);
}
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
pbs_base_log = (int)GetParam().pbs_base_log;
pbs_level = (int)GetParam().pbs_level;
ks_base_log = (int)GetParam().ks_base_log;
ks_level = (int)GetParam().ks_level;
for (size_t i = 0; i < MAX_INPUTS; i++) {
number_of_bits_to_extract_array[i] =
(int)GetParam().number_of_bits_to_extract_array[i];
number_of_bits_of_message_including_padding_array[i] =
(int)GetParam().number_of_bits_of_message_including_padding_array[i];
}
number_of_inputs = (int)GetParam().number_of_inputs;
input_lwe_dimension = glwe_dimension * polynomial_size;
output_lwe_dimension = lwe_dimension;
bit_extraction_setup(
stream_array, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array, &d_ksk_array, &plaintexts, &d_lwe_ct_in_array,
&d_lwe_ct_out_array, bit_extract_buffer_array, lwe_dimension,
glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, ks_base_log, ks_level, pbs_base_log, pbs_level,
number_of_bits_of_message_including_padding_array,
number_of_bits_to_extract_array, delta_log_array, delta_array,
number_of_inputs, REPETITIONS, SAMPLES, gpu_index);
}
void TearDown() {
bit_extraction_teardown(stream_array, csprng, lwe_sk_in_array,
lwe_sk_out_array, d_fourier_bsk_array, d_ksk_array,
plaintexts, d_lwe_ct_in_array, d_lwe_ct_out_array,
bit_extract_buffer_array, SAMPLES, gpu_index);
}
};
TEST_P(BitExtractionTestPrimitives_u64, bit_extraction) {
int total_bits_to_extract = 0;
for (int i = 0; i < number_of_inputs; i++) {
total_bits_to_extract += number_of_bits_to_extract_array[i];
}
uint64_t *lwe_ct_out_array =
(uint64_t *)malloc((output_lwe_dimension + 1) * total_bits_to_extract *
SAMPLES * sizeof(uint64_t));
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (output_lwe_dimension + 1);
int ksk_size = ks_level * input_lwe_dimension * (output_lwe_dimension + 1);
for (uint r = 0; r < REPETITIONS; r++) {
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * output_lwe_dimension);
auto d_cur_rep_ct_lwe_in_array =
&d_lwe_ct_in_array[r * SAMPLES * number_of_inputs *
(input_lwe_dimension + 1)];
for (uint s = 0; s < SAMPLES; s++) {
auto d_cur_sample_ct_lwe_in_array =
&d_cur_rep_ct_lwe_in_array[s * number_of_inputs *
(input_lwe_dimension + 1)];
auto d_cur_sample_ct_lwe_out_array =
&d_lwe_ct_out_array[s * total_bits_to_extract *
(output_lwe_dimension + 1)];
// Execute bit extract
auto cur_sample_ct_lwe_out_array =
&lwe_ct_out_array[s * total_bits_to_extract *
(output_lwe_dimension + 1)];
cuda_extract_bits_64(
stream_array[s], gpu_index, (void *)d_cur_sample_ct_lwe_out_array,
(void *)d_cur_sample_ct_lwe_in_array, bit_extract_buffer_array[s],
(void *)d_ksk, (void *)d_fourier_bsk, number_of_bits_to_extract_array,
delta_log_array, input_lwe_dimension, output_lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level, ks_base_log,
ks_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(
cur_sample_ct_lwe_out_array, d_cur_sample_ct_lwe_out_array,
(output_lwe_dimension + 1) * total_bits_to_extract * sizeof(uint64_t),
stream_array[s], gpu_index);
}
for (size_t s = 0; s < SAMPLES; s++) {
void *v_stream = (void *)stream_array[s];
cuda_synchronize_stream(v_stream);
}
cudaDeviceSynchronize();
for (size_t s = 0; s < SAMPLES; s++) {
auto cur_sample_result_array =
&lwe_ct_out_array[s * total_bits_to_extract *
(output_lwe_dimension + 1)];
int cur_total_bits = 0;
for (int j = 0; j < number_of_inputs; j++) {
auto cur_input_result_array =
&cur_sample_result_array[cur_total_bits *
(output_lwe_dimension + 1)];
cur_total_bits += number_of_bits_to_extract_array[j];
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
s * number_of_inputs + j];
for (size_t i = 0; i < number_of_bits_to_extract_array[j]; i++) {
auto result_ct =
&cur_input_result_array[(number_of_bits_to_extract_array[j] - 1 -
i) *
(output_lwe_dimension + 1)];
uint64_t decrypted_message = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk_out, result_ct, output_lwe_dimension, &decrypted_message);
// Round after decryption
uint64_t decrypted_rounded =
closest_representable(decrypted_message, 1, 1);
// Bring back the extracted bit found in the MSB in the LSB
uint64_t decrypted_extract_bit = decrypted_rounded >> 63;
uint64_t expected =
((plaintext >> delta_log_array[j]) >> i) & (uint64_t)(1);
EXPECT_EQ(decrypted_extract_bit, expected);
}
}
}
}
}
// Defines the parameter sets for which bit extraction will be tested.
// Each test is executed once per parameter set listed below.
::testing::internal::ParamGenerator<BitExtractionTestParams>
bit_extract_params_u64 = ::testing::Values(
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// ks_base_log, ks_level, number_of_message_bits,
// number_of_bits_to_extract, number_of_inputs
(BitExtractionTestParams){585,
1,
1024,
7.52316384526264e-37,
7.52316384526264e-37,
10,
2,
4,
7,
{5, 4, 4, 3},
{5, 4, 4, 3},
4});
std::string
printParamName(::testing::TestParamInfo<BitExtractionTestParams> p) {
BitExtractionTestParams params = p.param;
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
std::to_string(params.pbs_base_log) + "_pbs_level_" +
std::to_string(params.pbs_level) + "_ks_base_log_" +
std::to_string(params.ks_base_log) + "_ks_level_" +
std::to_string(params.ks_level) + "_number_of_message_bits_" +
std::to_string(
params.number_of_bits_of_message_including_padding_array[0]) +
"_" +
std::to_string(
params.number_of_bits_of_message_including_padding_array[1]) +
"_" +
std::to_string(
params.number_of_bits_of_message_including_padding_array[2]) +
"_" +
std::to_string(
params.number_of_bits_of_message_including_padding_array[3]) +
"_number_of_bits_to_extract_" +
std::to_string(params.number_of_bits_to_extract_array[0]) + "_" +
std::to_string(params.number_of_bits_to_extract_array[1]) + "_" +
std::to_string(params.number_of_bits_to_extract_array[2]) + "_" +
std::to_string(params.number_of_bits_to_extract_array[3]) +
"_number_of_inputs_" + std::to_string(params.number_of_inputs);
}
INSTANTIATE_TEST_CASE_P(BitExtractionInstantiation,
BitExtractionTestPrimitives_u64, bit_extract_params_u64,
printParamName);

View File

@@ -1,220 +0,0 @@
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
const unsigned REPETITIONS = 2;
const unsigned SAMPLES = 50;
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int number_of_inputs;
} CircuitBootstrapTestParams;
class CircuitBootstrapTestPrimitives_u64
: public ::testing::TestWithParam<CircuitBootstrapTestParams> {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int number_of_inputs;
int number_of_bits_of_message_including_padding;
int ggsw_size;
uint64_t delta;
int delta_log;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
double *d_fourier_bsk_array;
uint64_t *d_pksk_array;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_ggsw_ct_out_array;
uint64_t *d_lut_vector_indexes;
int8_t *cbs_buffer;
public:
// Test arithmetic functions
void SetUp() {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
pbs_base_log = (int)GetParam().pbs_base_log;
pbs_level = (int)GetParam().pbs_level;
pksk_base_log = (int)GetParam().pksk_base_log;
pksk_level = (int)GetParam().pksk_level;
cbs_base_log = (int)GetParam().cbs_base_log;
cbs_level = (int)GetParam().cbs_level;
number_of_inputs = (int)GetParam().number_of_inputs;
// We generate binary messages
number_of_bits_of_message_including_padding = 2;
ggsw_size = cbs_level * (glwe_dimension + 1) * (glwe_dimension + 1) *
polynomial_size;
circuit_bootstrap_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array, &d_pksk_array, &plaintexts, &d_lwe_ct_in_array,
&d_ggsw_ct_out_array, &d_lut_vector_indexes, &cbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, pksk_base_log, pksk_level, pbs_base_log,
pbs_level, cbs_level, number_of_bits_of_message_including_padding,
ggsw_size, &delta_log, &delta, number_of_inputs, REPETITIONS, SAMPLES,
gpu_index);
}
void TearDown() {
circuit_bootstrap_teardown(
stream, csprng, lwe_sk_in_array, lwe_sk_out_array, d_fourier_bsk_array,
d_pksk_array, plaintexts, d_lwe_ct_in_array, d_lut_vector_indexes,
d_ggsw_ct_out_array, cbs_buffer, gpu_index);
}
};
TEST_P(CircuitBootstrapTestPrimitives_u64, circuit_bootstrap) {
void *v_stream = (void *)stream;
uint64_t *ggsw_ct_out = (uint64_t *)malloc(ggsw_size * sizeof(uint64_t));
for (uint r = 0; r < REPETITIONS; r++) {
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
int pksk_list_size = pksk_level * (glwe_dimension + 1) * polynomial_size *
(glwe_dimension * polynomial_size + 1) *
(glwe_dimension + 1);
uint64_t *d_pksk_list = d_pksk_array + (ptrdiff_t)(pksk_list_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (uint s = 0; s < SAMPLES; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute circuit bootstrap
cuda_circuit_bootstrap_64(
stream, gpu_index, (void *)d_ggsw_ct_out_array, (void *)d_lwe_ct_in,
(void *)d_fourier_bsk, (void *)d_pksk_list,
(void *)d_lut_vector_indexes, cbs_buffer, delta_log, polynomial_size,
glwe_dimension, lwe_dimension, pbs_level, pbs_base_log, pksk_level,
pksk_base_log, cbs_level, cbs_base_log, number_of_inputs,
cuda_get_max_shared_memory(gpu_index));
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t *decrypted =
(uint64_t *)malloc(polynomial_size * (glwe_dimension + 1) *
cbs_level * sizeof(uint64_t));
// Copy result back
cuda_memcpy_async_to_cpu(
ggsw_ct_out, d_ggsw_ct_out_array + i * ggsw_size,
ggsw_size * sizeof(uint64_t), stream, gpu_index);
cuda_synchronize_stream(v_stream);
uint64_t multiplying_factor = -(plaintext >> delta_log);
for (int l = 1; l < cbs_level + 1; l++) {
for (int j = 0; j < glwe_dimension; j++) {
uint64_t *res = decrypted + (ptrdiff_t)((l - 1) * polynomial_size *
(glwe_dimension + 1) +
j * polynomial_size);
uint64_t *glwe_ct_out =
ggsw_ct_out +
(ptrdiff_t)((l - 1) * polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) +
j * polynomial_size * (glwe_dimension + 1));
concrete_cpu_decrypt_glwe_ciphertext_u64(
lwe_sk_out, res, glwe_ct_out, glwe_dimension, polynomial_size);
for (int k = 0; k < polynomial_size; k++) {
uint64_t expected_decryption =
lwe_sk_out[j * polynomial_size + k] * multiplying_factor;
expected_decryption >>= (64 - cbs_base_log * l);
uint64_t decoded_plaintext =
closest_representable(res[k], l, cbs_base_log) >>
(64 - cbs_base_log * l);
EXPECT_EQ(expected_decryption, decoded_plaintext);
}
}
}
// Check last glwe on last level
uint64_t *res =
decrypted + (ptrdiff_t)((cbs_level - 1) * polynomial_size *
(glwe_dimension + 1) +
glwe_dimension * polynomial_size);
uint64_t *glwe_ct_out =
ggsw_ct_out +
(ptrdiff_t)((cbs_level - 1) * polynomial_size *
(glwe_dimension + 1) * (glwe_dimension + 1) +
glwe_dimension * polynomial_size *
(glwe_dimension + 1));
concrete_cpu_decrypt_glwe_ciphertext_u64(
lwe_sk_out, res, glwe_ct_out, glwe_dimension, polynomial_size);
for (int k = 0; k < polynomial_size; k++) {
uint64_t expected_decryption = (k == 0) ? plaintext / delta : 0;
uint64_t decoded_plaintext =
closest_representable(res[k], cbs_level, cbs_base_log) >>
(64 - cbs_base_log * cbs_level);
EXPECT_EQ(expected_decryption, decoded_plaintext);
}
free(decrypted);
}
}
}
free(ggsw_ct_out);
}
// Defines the parameter sets for which circuit bootstrapping will be tested.
// Each test is executed once per parameter set listed below.
::testing::internal::ParamGenerator<CircuitBootstrapTestParams> cbs_params_u64 =
::testing::Values(
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// pksk_base_log, pksk_level, cbs_base_log, cbs_level, number_of_inputs
(CircuitBootstrapTestParams){10, 2, 512, 7.52316384526264e-37,
7.52316384526264e-37, 11, 2, 15, 2, 10, 1,
10});
std::string
printParamName(::testing::TestParamInfo<CircuitBootstrapTestParams> p) {
CircuitBootstrapTestParams params = p.param;
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
std::to_string(params.pbs_base_log) + "_pbs_level_" +
std::to_string(params.pbs_level) + "_pksk_base_log_" +
std::to_string(params.pksk_base_log) + "_pksk_level_" +
std::to_string(params.pksk_level) + "_cbs_base_log_" +
std::to_string(params.cbs_base_log) + "_cbs_level_" +
std::to_string(params.cbs_level);
}
INSTANTIATE_TEST_CASE_P(CircuitBootstrapInstantiation,
CircuitBootstrapTestPrimitives_u64, cbs_params_u64,
printParamName);

View File

@@ -1,339 +0,0 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <utils.h>
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int message_modulus;
int carry_modulus;
int number_of_inputs;
int repetitions;
int samples;
} ClassicalBootstrapTestParams;
class ClassicalBootstrapTestPrimitives_u64
: public ::testing::TestWithParam<ClassicalBootstrapTestParams> {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int message_modulus;
int carry_modulus;
int payload_modulus;
int number_of_inputs;
int repetitions;
int samples;
uint64_t delta;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
double *d_fourier_bsk_array;
uint64_t *d_lut_pbs_identity;
uint64_t *d_lut_pbs_indexes;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *lwe_ct_out_array;
public:
// Test arithmetic functions
void SetUp() {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
pbs_base_log = (int)GetParam().pbs_base_log;
pbs_level = (int)GetParam().pbs_level;
message_modulus = (int)GetParam().message_modulus;
carry_modulus = (int)GetParam().carry_modulus;
number_of_inputs = (int)GetParam().number_of_inputs;
repetitions = (int)GetParam().repetitions;
samples = (int)GetParam().samples;
bootstrap_classical_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_fourier_bsk_array, &plaintexts, &d_lut_pbs_identity,
&d_lut_pbs_indexes, &d_lwe_ct_in_array, &d_lwe_ct_out_array,
lwe_dimension, glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, pbs_base_log, pbs_level, message_modulus,
carry_modulus, &payload_modulus, &delta, number_of_inputs, repetitions,
samples, gpu_index);
lwe_ct_out_array =
(uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t));
}
void TearDown() {
free(lwe_ct_out_array);
bootstrap_classical_teardown(
stream, csprng, lwe_sk_in_array, lwe_sk_out_array, d_fourier_bsk_array,
plaintexts, d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_ct_out_array, gpu_index);
}
};
TEST_P(ClassicalBootstrapTestPrimitives_u64, amortized_bootstrap) {
int8_t *pbs_buffer;
scratch_cuda_bootstrap_amortized_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
// Here execute the PBS
for (int r = 0; r < repetitions; r++) {
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (int s = 0; s < samples; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute PBS
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t),
stream, gpu_index);
for (int j = 0; j < number_of_inputs; j++) {
uint64_t *result =
lwe_ct_out_array +
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
s * number_of_inputs + j];
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted);
EXPECT_NE(decrypted, plaintext);
// let err = (decrypted >= plaintext) ? decrypted - plaintext : plaintext - decrypted;
// error_sample_vec.push(err);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, plaintext / delta)
<< "Repetition: " << r << ", sample: " << s;
}
}
}
cleanup_cuda_bootstrap_amortized(stream, gpu_index, &pbs_buffer);
}
TEST_P(ClassicalBootstrapTestPrimitives_u64, low_latency_bootstrap) {
int8_t *pbs_buffer;
scratch_cuda_bootstrap_low_latency_64(
stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
int number_of_sm = 0;
cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
// Here execute the PBS
for (int r = 0; r < repetitions; r++) {
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (int s = 0; s < samples; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute PBS
cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
number_of_inputs, 1, 0, cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t),
stream, gpu_index);
for (int j = 0; j < number_of_inputs; j++) {
uint64_t *result =
lwe_ct_out_array +
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
s * number_of_inputs + j];
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted);
EXPECT_NE(decrypted, plaintext);
// let err = (decrypted >= plaintext) ? decrypted - plaintext : plaintext - decrypted;
// error_sample_vec.push(err);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, plaintext / delta);
}
}
}
cleanup_cuda_bootstrap_low_latency(stream, gpu_index, &pbs_buffer);
}
// Defines the parameter sets for which the PBS will be tested.
// Each test is executed once per parameter set listed below.
::testing::internal::ParamGenerator<ClassicalBootstrapTestParams>
pbs_params_u64 = ::testing::Values(
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// message_modulus, carry_modulus, number_of_inputs, repetitions,
// samples
// BOOLEAN_DEFAULT_PARAMETERS
(ClassicalBootstrapTestParams){777, 3, 512, 1.3880686109937e-11,
1.1919984450689246e-23, 18, 1, 2, 2, 2,
2, 40},
// BOOLEAN_TFHE_LIB_PARAMETERS
(ClassicalBootstrapTestParams){830, 2, 1024, 1.994564705573226e-12,
8.645717832544903e-32, 23, 1, 2, 2, 2, 2,
40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
(ClassicalBootstrapTestParams){678, 5, 256, 5.203010004723453e-10,
1.3996292326131784e-19, 15, 1, 2, 1, 2,
2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
(ClassicalBootstrapTestParams){684, 3, 512, 4.177054989616946e-10,
1.1919984450689246e-23, 18, 1, 2, 2, 2,
2, 40},
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
(ClassicalBootstrapTestParams){656, 2, 512, 1.1641198952558192e-09,
1.6434266310406663e-15, 8, 2, 4, 1, 2, 2,
40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
(ClassicalBootstrapTestParams){742, 2, 1024, 4.998277131225527e-11,
8.645717832544903e-32, 23, 1, 2, 4, 2, 2,
40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
(ClassicalBootstrapTestParams){745, 1, 2048, 4.478453795193731e-11,
8.645717832544903e-32, 23, 1, 2, 8, 2, 2,
40},
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
(ClassicalBootstrapTestParams){807, 1, 4096, 4.629015039118823e-12,
4.70197740328915e-38, 22, 1, 32, 1, 2, 1,
40},
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
(ClassicalBootstrapTestParams){915, 1, 8192, 8.883173851180252e-14,
4.70197740328915e-38, 22, 1, 64, 1, 2, 1,
5},
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
(ClassicalBootstrapTestParams){864, 1, 8192, 1.5843564961097632e-15,
4.70197740328915e-38, 15, 2, 8, 8, 2, 1,
5},
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
(ClassicalBootstrapTestParams){930, 1, 16384, 5.129877458078009e-14,
4.70197740328915e-38, 15, 2, 128, 1, 2,
1, 5},
// BOOLEAN_DEFAULT_PARAMETERS
(ClassicalBootstrapTestParams){777, 3, 512, 1.3880686109937e-11,
1.1919984450689246e-23, 18, 1, 2, 2, 100,
2, 40},
// BOOLEAN_TFHE_LIB_PARAMETERS
(ClassicalBootstrapTestParams){830, 2, 1024, 1.994564705573226e-12,
8.645717832544903e-32, 23, 1, 2, 2, 100,
2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_0
(ClassicalBootstrapTestParams){678, 5, 256, 5.203010004723453e-10,
1.3996292326131784e-19, 15, 1, 2, 1, 100,
2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_1
(ClassicalBootstrapTestParams){684, 3, 512, 4.177054989616946e-10,
1.1919984450689246e-23, 18, 1, 2, 2, 100,
2, 40},
// SHORTINT_PARAM_MESSAGE_2_CARRY_0
(ClassicalBootstrapTestParams){656, 2, 512, 1.1641198952558192e-09,
1.6434266310406663e-15, 8, 2, 4, 1, 100,
2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_2
// SHORTINT_PARAM_MESSAGE_2_CARRY_1
// SHORTINT_PARAM_MESSAGE_3_CARRY_0
(ClassicalBootstrapTestParams){742, 2, 1024, 4.998277131225527e-11,
8.645717832544903e-32, 23, 1, 2, 4, 100,
2, 40},
// SHORTINT_PARAM_MESSAGE_1_CARRY_3
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
// SHORTINT_PARAM_MESSAGE_3_CARRY_1
// SHORTINT_PARAM_MESSAGE_4_CARRY_0
(ClassicalBootstrapTestParams){745, 1, 2048, 4.478453795193731e-11,
8.645717832544903e-32, 23, 1, 2, 8, 100,
2, 40},
// SHORTINT_PARAM_MESSAGE_5_CARRY_0
// SHORTINT_PARAM_MESSAGE_3_CARRY_2
(ClassicalBootstrapTestParams){807, 1, 4096, 4.629015039118823e-12,
4.70197740328915e-38, 22, 1, 32, 1, 100,
1, 40},
// SHORTINT_PARAM_MESSAGE_6_CARRY_0
(ClassicalBootstrapTestParams){915, 1, 8192, 8.883173851180252e-14,
4.70197740328915e-38, 22, 1, 64, 1, 100,
1, 5},
// SHORTINT_PARAM_MESSAGE_3_CARRY_3
(ClassicalBootstrapTestParams){864, 1, 8192, 1.5843564961097632e-15,
4.70197740328915e-38, 15, 2, 8, 8, 100,
1, 5},
// SHORTINT_PARAM_MESSAGE_4_CARRY_3
// SHORTINT_PARAM_MESSAGE_7_CARRY_0
(ClassicalBootstrapTestParams){930, 1, 16384, 5.129877458078009e-14,
4.70197740328915e-38, 15, 2, 128, 1, 100,
1, 5});
std::string
printParamName(::testing::TestParamInfo<ClassicalBootstrapTestParams> p) {
ClassicalBootstrapTestParams params = p.param;
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
std::to_string(params.pbs_base_log) + "_pbs_level_" +
std::to_string(params.pbs_level) + "_number_of_inputs_" +
std::to_string(params.number_of_inputs);
}
INSTANTIATE_TEST_CASE_P(ClassicalBootstrapInstantiation,
ClassicalBootstrapTestPrimitives_u64, pbs_params_u64,
printParamName);

View File

@@ -1,149 +0,0 @@
#include <cmath>
#include <cstdint>
#include <functional>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdlib.h>
const unsigned REPETITIONS = 5;
const unsigned SAMPLES = 50;
typedef struct {
int glwe_dimension;
int polynomial_size;
int p; // number_of_bits_to_extract
int tau;
double glwe_modular_variance;
int base_log;
int level_count;
} CMUXTreeTestParams;
class CMUXTreeTestPrimitives_u64
: public ::testing::TestWithParam<CMUXTreeTestParams> {
protected:
int glwe_dimension;
int polynomial_size;
int p;
int tau;
double glwe_modular_variance;
int base_log;
int level_count;
uint64_t delta;
uint32_t delta_log;
Csprng *csprng;
uint64_t *plaintexts;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *glwe_sk;
uint64_t *d_lut_identity;
int8_t *cmux_tree_buffer = nullptr;
uint64_t *d_ggsw_bit_array;
uint64_t *d_glwe_out;
uint64_t *glwe_out;
public:
// Test arithmetic functions
void SetUp() {
stream = cuda_create_stream(0);
// TestParams
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
p = (int)GetParam().p;
tau = (int)GetParam().tau;
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
base_log = (int)GetParam().base_log;
level_count = (int)GetParam().level_count;
cmux_tree_setup(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
&d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
glwe_dimension, polynomial_size, base_log, level_count,
glwe_modular_variance, p, tau, &delta_log, REPETITIONS,
SAMPLES, gpu_index);
// Value of the shift we multiply our messages by
delta = ((uint64_t)(1) << delta_log);
glwe_out = (uint64_t *)malloc(tau * (glwe_dimension + 1) * polynomial_size *
sizeof(uint64_t));
}
void TearDown() {
free(glwe_out);
cmux_tree_teardown(stream, &csprng, &glwe_sk, &d_lut_identity, &plaintexts,
&d_ggsw_bit_array, &cmux_tree_buffer, &d_glwe_out,
gpu_index);
}
};
TEST_P(CMUXTreeTestPrimitives_u64, cmux_tree) {
int ggsw_size = polynomial_size * (glwe_dimension + 1) *
(glwe_dimension + 1) * level_count;
int glwe_size = (glwe_dimension + 1) * polynomial_size;
uint32_t r_lut = 1;
if (tau * p > log2(polynomial_size)) {
r_lut = tau * p - log2(polynomial_size);
}
// Here execute the PBS
for (uint r = 0; r < REPETITIONS; r++) {
for (uint s = 0; s < SAMPLES; s++) {
uint64_t witness = plaintexts[r * SAMPLES + s];
uint64_t *d_ggsw_bit_array_slice =
d_ggsw_bit_array +
(ptrdiff_t)((r * SAMPLES * r_lut + s * r_lut) * ggsw_size);
// Execute CMUX tree
cuda_cmux_tree_64(stream, gpu_index, (void *)d_glwe_out,
(void *)d_ggsw_bit_array_slice, (void *)d_lut_identity,
cmux_tree_buffer, glwe_dimension, polynomial_size,
base_log, level_count, (1 << (tau * p)), tau,
cuda_get_max_shared_memory(gpu_index));
// Copy result back
cuda_memcpy_async_to_cpu(glwe_out, d_glwe_out,
tau * glwe_size * sizeof(uint64_t), stream,
gpu_index);
cuda_synchronize_stream(stream);
for (int tree = 0; tree < tau; tree++) {
uint64_t *result = glwe_out + tree * glwe_size;
uint64_t *decrypted =
(uint64_t *)malloc(polynomial_size * sizeof(uint64_t));
concrete_cpu_decrypt_glwe_ciphertext_u64(
glwe_sk, decrypted, result, glwe_dimension, polynomial_size);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted[0] & rounding_bit) << 1;
uint64_t decoded = (decrypted[0] + rounding) / delta;
EXPECT_EQ(decoded, witness % (1 << p))
<< "Repetition: " << r << ", sample: " << s << ", tree: " << tree;
free(decrypted);
}
}
}
cuda_synchronize_stream(stream);
}
// Defines the parameter sets for which the CMUX tree will be tested.
// Each test is executed once per parameter set listed below.
::testing::internal::ParamGenerator<CMUXTreeTestParams> cmux_tree_params_u64 =
::testing::Values(
// k, N, p, tau, glwe_variance, base_log, level_count
(CMUXTreeTestParams){2, 256, 3, 4, 2.9403601535432533e-16, 6, 3},
(CMUXTreeTestParams){2, 512, 4, 2, 2.9403601535432533e-16, 6, 3},
(CMUXTreeTestParams){1, 1024, 11, 1, 2.9403601535432533e-16, 6, 3});
std::string printParamName(::testing::TestParamInfo<CMUXTreeTestParams> p) {
CMUXTreeTestParams params = p.param;
return "k_" + std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_tau_" +
std::to_string(params.tau) + "_p_" + std::to_string(params.p) +
"_base_log_" + std::to_string(params.base_log) + "_level_count_" +
std::to_string(params.level_count);
}
INSTANTIATE_TEST_CASE_P(CMUXTreeInstantiation, CMUXTreeTestPrimitives_u64,
cmux_tree_params_u64, printParamName);

View File

@@ -1,129 +0,0 @@
#include "concrete-cpu.h"
#include "utils.h"
#include "gtest/gtest.h"
#include <bootstrap.h>
#include <cstdint>
#include <device.h>
#include <functional>
#include <random>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
typedef struct {
size_t polynomial_size;
int samples;
} FourierTransformTestParams;
class FourierTransformTestPrimitives_u64
: public ::testing::TestWithParam<FourierTransformTestParams> {
protected:
size_t polynomial_size;
int samples;
cudaStream_t *stream;
int gpu_index = 0;
double *poly1;
double *poly2; // will be used as extracted result for cuda mult
double *poly_exp_result;
double2 *h_cpoly1;
double2 *h_cpoly2; // will be used as a result poly
double2 *d_cpoly1;
double2 *d_cpoly2; // will be used as a result poly
public:
void SetUp() {
stream = cuda_create_stream(0);
// get test params
polynomial_size = (int)GetParam().polynomial_size;
samples = (int)GetParam().samples;
fft_setup(stream, &poly1, &poly2, &h_cpoly1, &h_cpoly2, &d_cpoly1,
&d_cpoly2, polynomial_size, samples, gpu_index);
// allocate memory
poly_exp_result =
(double *)malloc(polynomial_size * 2 * samples * sizeof(double));
memset(poly_exp_result, 0, polynomial_size * 2 * samples * sizeof(double));
// execute school book multiplication
for (size_t p = 0; p < (size_t)samples; p++) {
auto left = &poly1[p * polynomial_size];
auto right = &poly2[p * polynomial_size];
auto res = &poly_exp_result[p * polynomial_size * 2];
// multiplication
for (std::size_t i = 0; i < polynomial_size; ++i) {
for (std::size_t j = 0; j < polynomial_size; ++j) {
res[i + j] += left[i] * right[j];
}
}
// make result negacyclic
for (size_t i = 0; i < polynomial_size; i++) {
res[i] = res[i] - res[i + polynomial_size];
}
}
}
void TearDown() {
fft_teardown(stream, poly1, poly2, h_cpoly1, h_cpoly2, d_cpoly1, d_cpoly2,
gpu_index);
free(poly_exp_result);
}
};
TEST_P(FourierTransformTestPrimitives_u64, cuda_fft_mult) {
int r = 0;
auto cur_input1 = &d_cpoly1[r * polynomial_size / 2 * samples];
auto cur_input2 = &d_cpoly2[r * polynomial_size / 2 * samples];
auto cur_h_c_res = &h_cpoly2[r * polynomial_size / 2 * samples];
auto cur_poly2 = &poly2[r * polynomial_size * samples];
auto cur_expected = &poly_exp_result[r * polynomial_size * 2 * samples];
cuda_fourier_polynomial_mul(cur_input1, cur_input2, cur_input2, stream, 0,
polynomial_size, samples);
cuda_memcpy_async_to_cpu(cur_h_c_res, cur_input2,
polynomial_size / 2 * samples * sizeof(double2),
stream, gpu_index);
cuda_synchronize_stream(stream);
for (int p = 0; p < samples; p++) {
for (size_t i = 0; i < (size_t)polynomial_size / 2; i++) {
cur_poly2[p * polynomial_size + i] =
cur_h_c_res[p * polynomial_size / 2 + i].x;
cur_poly2[p * polynomial_size + i + polynomial_size / 2] =
cur_h_c_res[p * polynomial_size / 2 + i].y;
}
}
for (size_t p = 0; p < (size_t)samples; p++) {
for (size_t i = 0; i < (size_t)polynomial_size; i++) {
EXPECT_NEAR(cur_poly2[p * polynomial_size + i],
cur_expected[p * 2 * polynomial_size + i], 1e-9);
}
}
}
::testing::internal::ParamGenerator<FourierTransformTestParams> fft_params_u64 =
::testing::Values((FourierTransformTestParams){256, 100},
(FourierTransformTestParams){512, 100},
(FourierTransformTestParams){1024, 100},
(FourierTransformTestParams){2048, 100},
(FourierTransformTestParams){4096, 100},
(FourierTransformTestParams){8192, 50},
(FourierTransformTestParams){16384, 10});
std::string
printParamName(::testing::TestParamInfo<FourierTransformTestParams> p) {
FourierTransformTestParams params = p.param;
return "N_" + std::to_string(params.polynomial_size) + "_samples_" +
std::to_string(params.samples);
}
INSTANTIATE_TEST_CASE_P(fftInstantiation, FourierTransformTestPrimitives_u64,
fft_params_u64, printParamName);

View File

@@ -1,266 +0,0 @@
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <utils.h>
const bool USE_MULTI_GPU = false;
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int ksk_base_log;
int ksk_level;
int total_message_bits;
int number_of_blocks;
int message_modulus;
int carry_modulus;
int repetitions;
int samples;
PBS_TYPE pbs_type;
} IntegerMultiplicationTestParams;
class IntegerMultiplicationTestPrimitives_u64
: public ::testing::TestWithParam<IntegerMultiplicationTestParams> {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int ksk_base_log;
int ksk_level;
int total_message_bits;
int number_of_blocks;
int message_modulus;
int carry_modulus;
int repetitions;
int samples;
PBS_TYPE pbs_type;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t delta;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts_1;
uint64_t *plaintexts_2;
uint64_t *expected;
void *d_bsk_array;
uint64_t *d_ksk_array;
uint64_t *d_lwe_ct_in_array_1;
uint64_t *d_lwe_ct_in_array_2;
uint64_t *d_lwe_ct_out_array;
int_mul_memory<uint64_t> *mem_ptr;
public:
// Test arithmetic functions
void SetUp() {
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
pbs_base_log = (int)GetParam().pbs_base_log;
pbs_level = (int)GetParam().pbs_level;
ksk_base_log = (int)GetParam().ksk_base_log;
ksk_level = (int)GetParam().ksk_level;
total_message_bits = (int)GetParam().total_message_bits;
number_of_blocks = (int)GetParam().number_of_blocks;
message_modulus = (int)GetParam().message_modulus;
carry_modulus = (int)GetParam().carry_modulus;
repetitions = (int)GetParam().repetitions;
samples = (int)GetParam().samples;
pbs_type = (PBS_TYPE)GetParam().pbs_type;
mem_ptr = new int_mul_memory<uint64_t>;
stream = cuda_create_stream(gpu_index);
integer_multiplication_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array, &d_bsk_array,
&d_ksk_array, &plaintexts_1, &plaintexts_2, &d_lwe_ct_in_array_1,
&d_lwe_ct_in_array_2, &d_lwe_ct_out_array, mem_ptr, lwe_dimension,
glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, pbs_base_log, pbs_level, ksk_base_log, ksk_level,
total_message_bits, number_of_blocks, message_modulus, carry_modulus,
&delta, repetitions, samples, pbs_type, gpu_index);
expected = (uint64_t *)malloc(repetitions * samples * number_of_blocks *
sizeof(uint64_t));
for (int r = 0; r < repetitions; r++) {
for (int s = 0; s < samples; s++) {
uint64_t message_1 = 0;
uint64_t message_2 = 0;
for (int i = 0; i < number_of_blocks; i++) {
message_1 += std::pow(message_modulus, i) *
plaintexts_1[r * samples * number_of_blocks +
s * number_of_blocks + i] /
delta;
message_2 += std::pow(message_modulus, i) *
plaintexts_2[r * samples * number_of_blocks +
s * number_of_blocks + i] /
delta;
}
uint64_t expected_result =
(message_1 * message_2) % ((uint64_t)1 << total_message_bits);
for (int i = number_of_blocks - 1; i >= 0; i--) {
uint64_t coef = expected_result / std::pow(message_modulus, i);
// Index per (repetition, sample) pair so earlier results are not overwritten
expected[r * samples * number_of_blocks + s * number_of_blocks + i] = coef;
expected_result -= coef * std::pow(message_modulus, i);
}
}
}
}
void TearDown() {
free(expected);
integer_multiplication_teardown(
stream, csprng, lwe_sk_in_array, lwe_sk_out_array, d_bsk_array,
d_ksk_array, plaintexts_1, plaintexts_2, d_lwe_ct_in_array_1,
d_lwe_ct_in_array_2, d_lwe_ct_out_array, mem_ptr);
cuda_synchronize_stream(stream);
cuda_destroy_stream(stream, gpu_index);
}
};
TEST_P(IntegerMultiplicationTestPrimitives_u64, integer_multiplication) {
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
int ksk_size =
ksk_level * (lwe_dimension + 1) * glwe_dimension * polynomial_size;
uint64_t *lwe_ct_out_array =
(uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
number_of_blocks * sizeof(uint64_t));
uint64_t *decrypted = (uint64_t *)malloc(number_of_blocks * sizeof(uint64_t));
for (int r = 0; r < repetitions; r++) {
void *d_bsk = d_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
uint64_t *lwe_sk =
lwe_sk_in_array + (ptrdiff_t)(glwe_dimension * polynomial_size * r);
for (int s = 0; s < samples; s++) {
uint64_t *d_lwe_ct_in_1 =
d_lwe_ct_in_array_1 +
(ptrdiff_t)((r * samples * number_of_blocks + s * number_of_blocks) *
(glwe_dimension * polynomial_size + 1));
uint64_t *d_lwe_ct_in_2 =
d_lwe_ct_in_array_2 +
(ptrdiff_t)((r * samples * number_of_blocks + s * number_of_blocks) *
(glwe_dimension * polynomial_size + 1));
uint32_t ct_degree_out = 0;
uint32_t ct_degree_left = 0;
uint32_t ct_degree_right = 0;
int8_t *mult_buffer = NULL;
// Execute integer mult
if (USE_MULTI_GPU) {
scratch_cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
mem_ptr, d_bsk, d_ksk, message_modulus, carry_modulus,
glwe_dimension, lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, ksk_base_log, ksk_level, number_of_blocks, pbs_type,
cuda_get_max_shared_memory(gpu_index), true);
cuda_integer_mult_radix_ciphertext_kb_64_multi_gpu(
d_lwe_ct_out_array, d_lwe_ct_in_1, d_lwe_ct_in_2, &ct_degree_out,
&ct_degree_left, &ct_degree_right, d_bsk, d_ksk, (void *)mem_ptr,
message_modulus, carry_modulus, glwe_dimension, lwe_dimension,
polynomial_size, pbs_base_log, pbs_level, ksk_base_log, ksk_level,
number_of_blocks, pbs_type, cuda_get_max_shared_memory(gpu_index));
} else {
scratch_cuda_integer_mult_radix_ciphertext_kb_64(
stream, gpu_index, (void *)mem_ptr, message_modulus, carry_modulus,
glwe_dimension, lwe_dimension, polynomial_size, pbs_base_log,
pbs_level, ksk_base_log, ksk_level, number_of_blocks, pbs_type,
cuda_get_max_shared_memory(gpu_index), true);
cuda_integer_mult_radix_ciphertext_kb_64(
stream, gpu_index, d_lwe_ct_out_array, d_lwe_ct_in_1, d_lwe_ct_in_2,
&ct_degree_out, &ct_degree_left, &ct_degree_right, d_bsk, d_ksk,
(void *)mem_ptr, message_modulus, carry_modulus, glwe_dimension,
lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
ksk_base_log, ksk_level, number_of_blocks, pbs_type,
cuda_get_max_shared_memory(gpu_index));
}
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
number_of_blocks * sizeof(uint64_t),
stream, gpu_index);
// Wait for the asynchronous copy to finish before decrypting on the host
cuda_synchronize_stream(stream);
// Process result
decrypt_integer_u64_blocks(lwe_ct_out_array, lwe_sk, &decrypted,
glwe_dimension * polynomial_size,
number_of_blocks, delta, message_modulus);
for (int i = 0; i < number_of_blocks; i++) {
ASSERT_EQ(decrypted[i], expected[r * samples * number_of_blocks +
s * number_of_blocks + i])
<< "Repetition: " << r << ", sample: " << s;
}
}
}
free(lwe_ct_out_array);
free(decrypted);
}
// Defines the parameter sets for which the integer multiplication will be
// tested.
::testing::internal::ParamGenerator<IntegerMultiplicationTestParams>
integer_mult_params_u64 = ::testing::Values(
// n, k, N, lwe_variance, glwe_variance, pbs_base_log, pbs_level,
// ksk_base_log, ksk_level,
// total_message_bits, number_of_blocks, message_modulus,
// carry_modulus, repetitions, samples
// SHORTINT_PARAM_MESSAGE_2_CARRY_2
// The total number of bits of message should not exceed 64 to be
// able to use a uint64_t representation for the result calculation
// in clear
(IntegerMultiplicationTestParams){744, 1, 2048, 4.478453795193731e-11,
8.645717832544903e-32, 23, 1, 3, 5, 4,
2, 4, 4, 1, 1, MULTI_BIT},
(IntegerMultiplicationTestParams){744, 1, 2048, 4.478453795193731e-11,
8.645717832544903e-32, 23, 1, 3, 5, 4,
2, 4, 4, 1, 1, LOW_LAT},
(IntegerMultiplicationTestParams){744, 1, 2048, 4.478453795193731e-11,
8.645717832544903e-32, 23, 1, 3, 5, 4,
2, 4, 4, 1, 1, AMORTIZED});
std::string
printParamName(::testing::TestParamInfo<IntegerMultiplicationTestParams> p) {
IntegerMultiplicationTestParams params = p.param;
const char *pbs_type;
switch (params.pbs_type) {
case 0:
pbs_type = "MULTIBIT";
break;
case 1:
pbs_type = "LOW_LAT";
break;
case 2:
pbs_type = "AMORTIZED";
break;
default:
pbs_type = "Unknown";
}
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
std::to_string(params.pbs_base_log) + "_pbs_level_" +
std::to_string(params.pbs_level) + "_number_of_blocks_" +
std::to_string(params.number_of_blocks) + "_message_modulus_" +
std::to_string(params.message_modulus) + "_carry_modulus_" +
std::to_string(params.carry_modulus) + "_" +
pbs_type;
}
INSTANTIATE_TEST_CASE_P(IntegerMultiplicationInstantiation,
IntegerMultiplicationTestPrimitives_u64,
integer_mult_params_u64, printParamName);

View File

@@ -1,149 +0,0 @@
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
const unsigned REPETITIONS = 2;
const unsigned SAMPLES = 50;
typedef struct {
int input_lwe_dimension;
int output_lwe_dimension;
double noise_variance;
int ksk_base_log;
int ksk_level;
int message_modulus;
int carry_modulus;
int number_of_inputs;
} KeyswitchTestParams;
class KeyswitchTestPrimitives_u64
: public ::testing::TestWithParam<KeyswitchTestParams> {
protected:
int input_lwe_dimension;
int output_lwe_dimension;
double noise_variance;
int ksk_base_log;
int ksk_level;
int message_modulus;
int carry_modulus;
int number_of_inputs;
int payload_modulus;
uint64_t delta;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
uint64_t *d_ksk_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *d_lwe_ct_in_array;
uint64_t *lwe_in_ct;
uint64_t *lwe_out_ct;
public:
// Test arithmetic functions
void SetUp() {
stream = cuda_create_stream(0);
// TestParams
input_lwe_dimension = (int)GetParam().input_lwe_dimension;
output_lwe_dimension = (int)GetParam().output_lwe_dimension;
noise_variance = (double)GetParam().noise_variance;
ksk_base_log = (int)GetParam().ksk_base_log;
ksk_level = (int)GetParam().ksk_level;
message_modulus = (int)GetParam().message_modulus;
carry_modulus = (int)GetParam().carry_modulus;
number_of_inputs = (int)GetParam().number_of_inputs;
keyswitch_setup(stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array,
&d_ksk_array, &plaintexts, &d_lwe_ct_in_array,
&d_lwe_ct_out_array, input_lwe_dimension,
output_lwe_dimension, noise_variance, ksk_base_log,
ksk_level, message_modulus, carry_modulus, &payload_modulus,
&delta, number_of_inputs, REPETITIONS, SAMPLES, gpu_index);
}
void TearDown() {
keyswitch_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
d_ksk_array, plaintexts, d_lwe_ct_in_array,
d_lwe_ct_out_array, gpu_index);
}
};
TEST_P(KeyswitchTestPrimitives_u64, keyswitch) {
uint64_t *lwe_out_ct = (uint64_t *)malloc(
(output_lwe_dimension + 1) * number_of_inputs * sizeof(uint64_t));
for (uint r = 0; r < REPETITIONS; r++) {
uint64_t *lwe_out_sk =
lwe_sk_out_array + (ptrdiff_t)(r * output_lwe_dimension);
int ksk_size = ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension;
uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
for (uint s = 0; s < SAMPLES; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(input_lwe_dimension + 1));
// Execute keyswitch
cuda_keyswitch_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array, (void *)d_lwe_ct_in,
(void *)d_ksk, input_lwe_dimension, output_lwe_dimension,
ksk_base_log, ksk_level, number_of_inputs);
// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_ct_out_array,
number_of_inputs * (output_lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
// Wait for the asynchronous copy to finish before decrypting on the host
cuda_synchronize_stream(stream);
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext = plaintexts[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_out_sk, lwe_out_ct + i * (output_lwe_dimension + 1),
output_lwe_dimension, &decrypted);
// The raw decryption still carries noise, so it must differ from the exact
// plaintext
EXPECT_NE(decrypted, plaintext);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
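// Illustrative example (values chosen for this sketch, not from the
// original test): with delta = 1 << 60, a noisy decryption of
// 3 * delta + e with |e| < delta / 2 has its noise cleared by the rounding
// step above and decodes back to 3.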
EXPECT_EQ(decoded, plaintext / delta);
}
}
}
free(lwe_out_ct);
}
// Defines the parameter sets for which the keyswitch will be tested.
::testing::internal::ParamGenerator<KeyswitchTestParams> ksk_params_u64 =
::testing::Values(
// n, k*N, noise_variance, ks_base_log, ks_level,
// message_modulus, carry_modulus, number_of_inputs
(KeyswitchTestParams){567, 1280, 2.9802322387695312e-18, 3, 3, 2, 1,
10},
(KeyswitchTestParams){694, 1536, 2.9802322387695312e-18, 4, 3, 2, 1,
10},
(KeyswitchTestParams){769, 2048, 2.9802322387695312e-18, 4, 3, 2, 1,
10},
(KeyswitchTestParams){754, 2048, 2.9802322387695312e-18, 3, 5, 2, 1,
10},
(KeyswitchTestParams){847, 4096, 2.9802322387695312e-18, 4, 4, 2, 1,
10},
(KeyswitchTestParams){881, 8192, 2.9802322387695312e-18, 3, 6, 2, 1,
10});
std::string printParamName(::testing::TestParamInfo<KeyswitchTestParams> p) {
KeyswitchTestParams params = p.param;
return "na_" + std::to_string(params.input_lwe_dimension) + "_nb_" +
std::to_string(params.output_lwe_dimension) + "_baselog_" +
std::to_string(params.ksk_base_log) + "_ksk_level_" +
std::to_string(params.ksk_level);
}
INSTANTIATE_TEST_CASE_P(KeyswitchInstantiation, KeyswitchTestPrimitives_u64,
ksk_params_u64, printParamName);

View File

@@ -1,269 +0,0 @@
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
const unsigned REPETITIONS = 5;
const unsigned SAMPLES = 100;
typedef struct {
int lwe_dimension;
double noise_variance;
int message_modulus;
int carry_modulus;
int number_of_inputs;
} LinearAlgebraTestParams;
class LinearAlgebraTestPrimitives_u64
: public ::testing::TestWithParam<LinearAlgebraTestParams> {
protected:
int lwe_dimension;
double noise_variance;
int message_modulus;
int carry_modulus;
int number_of_inputs;
int payload_modulus;
uint64_t delta;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_array;
uint64_t *d_lwe_in_1_ct;
uint64_t *d_lwe_in_2_ct;
uint64_t *d_plaintext_2;
uint64_t *d_cleartext;
uint64_t *d_lwe_out_ct;
uint64_t *lwe_in_1_ct;
uint64_t *lwe_in_2_ct;
uint64_t *lwe_out_ct;
uint64_t *plaintexts_1;
uint64_t *plaintexts_2;
int num_samples;
public:
// Test arithmetic functions
void SetUp() {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
noise_variance = (double)GetParam().noise_variance;
message_modulus = (int)GetParam().message_modulus;
carry_modulus = (int)GetParam().carry_modulus;
number_of_inputs = (int)GetParam().number_of_inputs;
payload_modulus = message_modulus * carry_modulus;
// Value of the shift we multiply our messages by
// In this test we use a smaller delta to avoid an overflow during
// multiplication
delta =
((uint64_t)(1) << 63) / (uint64_t)(payload_modulus * payload_modulus);
linear_algebra_setup(stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct,
&d_lwe_in_2_ct, &d_lwe_out_ct, &lwe_in_1_ct,
&lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
&plaintexts_2, &d_plaintext_2, &d_cleartext,
lwe_dimension, noise_variance, payload_modulus, delta,
number_of_inputs, REPETITIONS, SAMPLES, gpu_index);
}
void TearDown() {
linear_algebra_teardown(
stream, &csprng, &lwe_sk_array, &d_lwe_in_1_ct, &d_lwe_in_2_ct,
&d_lwe_out_ct, &lwe_in_1_ct, &lwe_in_2_ct, &lwe_out_ct, &plaintexts_1,
&plaintexts_2, &d_plaintext_2, &d_cleartext, gpu_index);
}
};
TEST_P(LinearAlgebraTestPrimitives_u64, addition) {
void *v_stream = (void *)stream;
// Here execute the addition
for (uint r = 0; r < REPETITIONS; r++) {
uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
for (uint s = 0; s < SAMPLES; s++) {
uint64_t *d_lwe_1_in =
d_lwe_in_1_ct +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
uint64_t *d_lwe_2_in =
d_lwe_in_2_ct +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute addition
cuda_add_lwe_ciphertext_vector_64(stream, gpu_index, (void *)d_lwe_out_ct,
(void *)d_lwe_1_in, (void *)d_lwe_2_in,
lwe_dimension, number_of_inputs);
// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
number_of_inputs * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(v_stream);
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
&decrypted);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, (plaintext_1 + plaintext_2) / delta)
<< "Repetition: " << r << ", sample: " << s;
}
}
}
}
TEST_P(LinearAlgebraTestPrimitives_u64, plaintext_addition) {
// Here execute the plaintext addition
for (uint r = 0; r < REPETITIONS; r++) {
uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
for (uint s = 0; s < SAMPLES; s++) {
uint64_t *d_lwe_1_slice =
d_lwe_in_1_ct +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
uint64_t *d_plaintext_2_in =
d_plaintext_2 +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs));
// Execute addition
cuda_add_lwe_ciphertext_vector_plaintext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_1_slice,
(void *)d_plaintext_2_in, lwe_dimension, number_of_inputs);
// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
number_of_inputs * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
// Wait for the asynchronous copy to finish before decrypting on the host
cuda_synchronize_stream(stream);
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t plaintext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
&decrypted);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, (plaintext_1 + plaintext_2) / delta)
<< "Repetition: " << r << ", sample: " << s << " i: " << i << ") "
<< plaintext_1 / delta << " + " << plaintext_2 / delta;
}
}
}
}
TEST_P(LinearAlgebraTestPrimitives_u64, cleartext_multiplication) {
void *v_stream = (void *)stream;
// Here execute the cleartext multiplication
for (uint r = 0; r < REPETITIONS; r++) {
uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
for (uint s = 0; s < SAMPLES; s++) {
uint64_t *d_lwe_1_slice =
d_lwe_in_1_ct +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
uint64_t *d_cleartext_in =
d_cleartext +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs));
// Execute cleartext multiplication
cuda_mult_lwe_ciphertext_vector_cleartext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_1_slice,
(void *)d_cleartext_in, lwe_dimension, number_of_inputs);
// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
number_of_inputs * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(v_stream);
for (int i = 0; i < number_of_inputs; i++) {
uint64_t cleartext_1 = plaintexts_1[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i] /
delta;
uint64_t cleartext_2 = plaintexts_2[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i] /
delta;
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
&decrypted);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, cleartext_1 * cleartext_2)
<< "Repetition: " << r << ", sample: " << s << " i: " << i
<< ", decrypted: " << decrypted;
}
}
}
}
TEST_P(LinearAlgebraTestPrimitives_u64, negate) {
// Here execute the negation
for (uint r = 0; r < REPETITIONS; r++) {
uint64_t *lwe_sk = lwe_sk_array + (ptrdiff_t)(r * lwe_dimension);
for (uint s = 0; s < SAMPLES; s++) {
uint64_t *d_lwe_1_slice =
d_lwe_in_1_ct +
(ptrdiff_t)((r * SAMPLES * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute negate
cuda_negate_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_out_ct, (void *)d_lwe_1_slice,
lwe_dimension, number_of_inputs);
// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct, d_lwe_out_ct,
number_of_inputs * (lwe_dimension + 1) *
sizeof(uint64_t),
stream, gpu_index);
// Wait for the asynchronous copy to finish before decrypting on the host
cuda_synchronize_stream(stream);
for (int i = 0; i < number_of_inputs; i++) {
uint64_t plaintext = plaintexts_1[r * SAMPLES * number_of_inputs +
s * number_of_inputs + i];
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk, lwe_out_ct + i * (lwe_dimension + 1), lwe_dimension,
&decrypted);
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, -plaintext / delta)
<< "Repetition: " << r << ", sample: " << s << " i: " << i;
}
}
}
}
// Defines the parameter sets for which the linear algebra operations will be
// tested.
::testing::internal::ParamGenerator<LinearAlgebraTestParams>
linear_algebra_params_u64 = ::testing::Values(
// n, lwe_noise_variance, message_modulus, carry_modulus, number_of_inputs
(LinearAlgebraTestParams){600, 7.52316384526264e-37, 2, 2, 10});
std::string
printParamName(::testing::TestParamInfo<LinearAlgebraTestParams> p) {
LinearAlgebraTestParams params = p.param;
return "n_" + std::to_string(params.lwe_dimension);
}
INSTANTIATE_TEST_CASE_P(LinearAlgebraInstantiation,
LinearAlgebraTestPrimitives_u64,
linear_algebra_params_u64, printParamName);

View File

@@ -1,211 +0,0 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <utils.h>
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int message_modulus;
int carry_modulus;
int number_of_inputs;
int grouping_factor;
int repetitions;
int samples;
} MultiBitBootstrapTestParams;
class MultiBitBootstrapTestPrimitives_u64
: public ::testing::TestWithParam<MultiBitBootstrapTestParams> {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int message_modulus;
int carry_modulus;
int payload_modulus;
int number_of_inputs;
int grouping_factor;
uint64_t delta;
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *plaintexts;
uint64_t *d_bsk_array;
uint64_t *d_lut_pbs_identity;
uint64_t *d_lut_pbs_indexes;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *lwe_ct_out_array;
int8_t *pbs_buffer;
int repetitions;
int samples;
public:
void SetUp() {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
grouping_factor = (int)GetParam().grouping_factor;
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
pbs_base_log = (int)GetParam().pbs_base_log;
pbs_level = (int)GetParam().pbs_level;
message_modulus = (int)GetParam().message_modulus;
carry_modulus = (int)GetParam().carry_modulus;
number_of_inputs = (int)GetParam().number_of_inputs;
repetitions = (int)GetParam().repetitions;
samples = (int)GetParam().samples;
bootstrap_multibit_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array, &d_bsk_array,
&plaintexts, &d_lut_pbs_identity, &d_lut_pbs_indexes,
&d_lwe_ct_in_array, &d_lwe_ct_out_array, &pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, lwe_modular_variance,
glwe_modular_variance, pbs_base_log, pbs_level, message_modulus,
carry_modulus, &payload_modulus, &delta, number_of_inputs, repetitions,
samples, gpu_index);
lwe_ct_out_array =
(uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t));
}
void TearDown() {
free(lwe_ct_out_array);
bootstrap_multibit_teardown(
stream, csprng, lwe_sk_in_array, lwe_sk_out_array, d_bsk_array,
plaintexts, d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
d_lwe_ct_out_array, &pbs_buffer, gpu_index);
}
};
TEST_P(MultiBitBootstrapTestPrimitives_u64, multi_bit_pbs) {
int bsk_size = (lwe_dimension / grouping_factor) * pbs_level *
(glwe_dimension + 1) * (glwe_dimension + 1) * polynomial_size *
(1 << grouping_factor);
for (int r = 0; r < repetitions; r++) {
uint64_t *d_bsk = d_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *lwe_sk_out =
lwe_sk_out_array + (ptrdiff_t)(r * glwe_dimension * polynomial_size);
for (int s = 0; s < samples; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array +
(ptrdiff_t)((r * samples * number_of_inputs + s * number_of_inputs) *
(lwe_dimension + 1));
// Execute PBS
cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
stream, gpu_index, (void *)d_lwe_ct_out_array,
(void *)d_lut_pbs_identity, (void *)d_lut_pbs_indexes,
(void *)d_lwe_ct_in, (void *)d_bsk, pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
pbs_level, number_of_inputs, 1, 0,
cuda_get_max_shared_memory(gpu_index));
// Copy result to the host memory
cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
(glwe_dimension * polynomial_size + 1) *
number_of_inputs * sizeof(uint64_t),
stream, gpu_index);
// Wait for the asynchronous copy to finish before decrypting on the host
cuda_synchronize_stream(stream);
for (int j = 0; j < number_of_inputs; j++) {
uint64_t *result =
lwe_ct_out_array +
(ptrdiff_t)(j * (glwe_dimension * polynomial_size + 1));
uint64_t plaintext = plaintexts[r * samples * number_of_inputs +
s * number_of_inputs + j];
uint64_t decrypted = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk_out, result, glwe_dimension * polynomial_size, &decrypted);
EXPECT_NE(decrypted, plaintext)
<< "Repetition: " << r << ", sample: " << s << ", input: " << j;
// The bit before the message
uint64_t rounding_bit = delta >> 1;
// Compute the rounding bit
uint64_t rounding = (decrypted & rounding_bit) << 1;
uint64_t decoded = (decrypted + rounding) / delta;
EXPECT_EQ(decoded, plaintext / delta)
<< "Repetition: " << r << ", sample: " << s << ", input: " << j;
}
}
}
// cleanup_cuda_multi_bit_pbs(stream, gpu_index, &pbs_buffer);
}
// Defines the parameter sets for which the multi-bit PBS will be tested.
::testing::internal::ParamGenerator<MultiBitBootstrapTestParams>
multipbs_params_u64 = ::testing::Values(
// fast test
(MultiBitBootstrapTestParams){16, 1, 256, 1.3880686109937e-11,
1.1919984450689246e-23, 23, 1, 2, 2, 1, 2,
1, 2},
(MultiBitBootstrapTestParams){16, 1, 256, 1.3880686109937e-11,
1.1919984450689246e-23, 23, 1, 2, 2, 128,
2, 1, 2},
// 4_bits_multi_bit_group_2
(MultiBitBootstrapTestParams){818, 1, 2048, 1.3880686109937e-11,
1.1919984450689246e-23, 22, 1, 2, 2, 1, 2,
1, 1},
(MultiBitBootstrapTestParams){818, 1, 2048, 1.3880686109937e-15,
1.1919984450689246e-24, 22, 1, 2, 2, 128,
2, 1, 1},
// 4_bits_multi_bit_group_3
(MultiBitBootstrapTestParams){888, 1, 2048, 4.9571231961752025e-12,
9.9409770026944e-32, 21, 1, 2, 2, 1, 3, 1,
1},
(MultiBitBootstrapTestParams){888, 1, 2048, 4.9571231961752025e-12,
9.9409770026944e-32, 21, 1, 2, 2, 128, 3,
1, 1},
(MultiBitBootstrapTestParams){742, 1, 2048, 4.9571231961752025e-12,
9.9409770026944e-32, 23, 1, 2, 2, 128, 2,
1, 1},
(MultiBitBootstrapTestParams){744, 1, 2048, 4.9571231961752025e-12,
9.9409770026944e-32, 23, 1, 2, 2, 1, 3, 1,
1},
(MultiBitBootstrapTestParams){744, 1, 2048, 4.9571231961752025e-12,
9.9409770026944e-32, 23, 1, 2, 2, 5, 3, 1,
1},
(MultiBitBootstrapTestParams){744, 1, 2048, 4.9571231961752025e-12,
9.9409770026944e-32, 23, 1, 2, 2, 128, 3,
1, 1});
std::string
printParamName(::testing::TestParamInfo<MultiBitBootstrapTestParams> p) {
MultiBitBootstrapTestParams params = p.param;
return "n_" + std::to_string(params.lwe_dimension) + "_k_" +
std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_pbs_base_log_" +
std::to_string(params.pbs_base_log) + "_pbs_level_" +
std::to_string(params.pbs_level) + "_grouping_factor_" +
std::to_string(params.grouping_factor) + "_number_of_inputs_" +
std::to_string(params.number_of_inputs);
}
INSTANTIATE_TEST_CASE_P(MultiBitBootstrapInstantiation,
MultiBitBootstrapTestPrimitives_u64,
multipbs_params_u64, printParamName);

View File

@@ -1,256 +0,0 @@
#include <cstdint>
#include <gtest/gtest.h>
#include <setup_and_teardown.h>
#include <stdio.h>
#include <stdlib.h>
const unsigned REPETITIONS = 2;
const unsigned SAMPLES = 10;
const unsigned MAX_TAU = 4;
typedef struct {
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int tau;
int p_array[MAX_TAU];
} WopBootstrapTestParams;
class WopBootstrapTestPrimitives_u64
: public ::testing::TestWithParam<WopBootstrapTestParams> {
protected:
int lwe_dimension;
int glwe_dimension;
int polynomial_size;
double lwe_modular_variance;
double glwe_modular_variance;
int pbs_base_log;
int pbs_level;
int ks_base_log;
int ks_level;
int pksk_base_log;
int pksk_level;
int cbs_base_log;
int cbs_level;
int tau;
uint32_t p_array[MAX_TAU];
uint64_t delta_array[MAX_TAU];
int cbs_delta_log;
uint32_t delta_log_array[MAX_TAU];
Csprng *csprng;
cudaStream_t *stream;
int gpu_index = 0;
uint64_t *lwe_sk_in_array;
uint64_t *lwe_sk_out_array;
uint64_t *lwe_in_ct_array;
uint64_t *lwe_out_ct_array;
uint64_t *plaintexts;
double *d_fourier_bsk_array;
uint64_t *d_ksk_array;
uint64_t *d_pksk_array;
uint64_t *d_lwe_ct_in_array;
uint64_t *d_lwe_ct_out_array;
uint64_t *d_lut_vector;
int8_t *wop_pbs_buffer;
int input_lwe_dimension;
public:
// Test arithmetic functions
void SetUp() {
stream = cuda_create_stream(0);
// TestParams
lwe_dimension = (int)GetParam().lwe_dimension;
glwe_dimension = (int)GetParam().glwe_dimension;
polynomial_size = (int)GetParam().polynomial_size;
lwe_modular_variance = (double)GetParam().lwe_modular_variance;
glwe_modular_variance = (double)GetParam().glwe_modular_variance;
pbs_base_log = (int)GetParam().pbs_base_log;
pbs_level = (int)GetParam().pbs_level;
ks_base_log = (int)GetParam().ks_base_log;
ks_level = (int)GetParam().ks_level;
pksk_base_log = (int)GetParam().pksk_base_log;
pksk_level = (int)GetParam().pksk_level;
cbs_base_log = (int)GetParam().cbs_base_log;
cbs_level = (int)GetParam().cbs_level;
tau = (int)GetParam().tau;
for (int i = 0; i < tau; i++) {
p_array[i] = (int)GetParam().p_array[i];
}
input_lwe_dimension = glwe_dimension * polynomial_size;
wop_pbs_setup(
stream, &csprng, &lwe_sk_in_array, &lwe_sk_out_array, &d_ksk_array,
&d_fourier_bsk_array, &d_pksk_array, &plaintexts, &d_lwe_ct_in_array,
&d_lwe_ct_out_array, &d_lut_vector, &wop_pbs_buffer, lwe_dimension,
glwe_dimension, polynomial_size, lwe_modular_variance,
glwe_modular_variance, ks_base_log, ks_level, pksk_base_log, pksk_level,
pbs_base_log, pbs_level, cbs_level, p_array, delta_log_array,
&cbs_delta_log, delta_array, tau, REPETITIONS, SAMPLES, gpu_index);
}
void TearDown() {
wop_pbs_teardown(stream, csprng, lwe_sk_in_array, lwe_sk_out_array,
d_ksk_array, d_fourier_bsk_array, d_pksk_array, plaintexts,
d_lwe_ct_in_array, d_lut_vector, d_lwe_ct_out_array,
wop_pbs_buffer, gpu_index);
}
};
TEST_P(WopBootstrapTestPrimitives_u64, wop_pbs) {
void *v_stream = (void *)stream;
uint64_t *lwe_out_ct_array =
(uint64_t *)malloc((input_lwe_dimension + 1) * tau * sizeof(uint64_t));
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
int ksk_size =
ks_level * (lwe_dimension + 1) * glwe_dimension * polynomial_size;
int pksk_list_size = pksk_level * (glwe_dimension + 1) * polynomial_size *
(glwe_dimension * polynomial_size + 1) *
(glwe_dimension + 1);
for (uint r = 0; r < REPETITIONS; r++) {
double *d_fourier_bsk = d_fourier_bsk_array + (ptrdiff_t)(bsk_size * r);
uint64_t *d_ksk = d_ksk_array + (ptrdiff_t)(ksk_size * r);
uint64_t *d_pksk_list = d_pksk_array + (ptrdiff_t)(pksk_list_size * r);
uint64_t *lwe_sk_in =
lwe_sk_in_array + (ptrdiff_t)(input_lwe_dimension * r);
for (uint s = 0; s < SAMPLES; s++) {
uint64_t *d_lwe_ct_in =
d_lwe_ct_in_array + (ptrdiff_t)((r * SAMPLES * tau + s * tau) *
(input_lwe_dimension + 1));
// Execute wop pbs
cuda_wop_pbs_64(
stream, gpu_index, (void *)d_lwe_ct_out_array, (void *)d_lwe_ct_in,
(void *)d_lut_vector, (void *)d_fourier_bsk, (void *)d_ksk,
(void *)d_pksk_list, wop_pbs_buffer, cbs_delta_log, glwe_dimension,
lwe_dimension, polynomial_size, pbs_base_log, pbs_level, ks_base_log,
ks_level, pksk_base_log, pksk_level, cbs_base_log, cbs_level, p_array,
delta_log_array, tau, cuda_get_max_shared_memory(gpu_index));
//// Copy result back
cuda_memcpy_async_to_cpu(lwe_out_ct_array, d_lwe_ct_out_array,
(input_lwe_dimension + 1) * tau *
sizeof(uint64_t),
stream, gpu_index);
cuda_synchronize_stream(v_stream);
for (int i = 0; i < tau; i++) {
uint64_t plaintext = plaintexts[r * SAMPLES * tau + s * tau + i];
uint64_t *result_ct =
lwe_out_ct_array + (ptrdiff_t)(i * (input_lwe_dimension + 1));
uint64_t decrypted_message = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk_in, result_ct, input_lwe_dimension, &decrypted_message);
// Round after decryption
uint64_t decrypted =
closest_representable(decrypted_message, 1, p_array[i]) >>
delta_log_array[i];
uint64_t expected = plaintext >> delta_log_array[i];
EXPECT_EQ(decrypted, expected)
<< " failed at tau " << i << ", repetition " << r << ", sample " << s;
}
}
}
}
// Defines the parameter sets for which the WoP-PBS will be tested.
::testing::internal::ParamGenerator<WopBootstrapTestParams> wop_pbs_params_u64 =
::testing::Values(
// lwe_dimension, glwe_dimension, polynomial_size, lwe_modular_variance,
// glwe_modular_variance, pbs_base_log, pbs_level, ks_base_log,
// ks_level, pksk_base_log, pksk_level, cbs_base_log, cbs_level, tau, p
(WopBootstrapTestParams){481,
2,
512,
7.52316384526264e-37,
7.52316384526264e-37,
4,
9,
1,
9,
4,
9,
6,
4,
1,
{11}}, // Full Wop-PBS
(WopBootstrapTestParams){481,
2,
512,
7.52316384526264e-37,
7.52316384526264e-37,
4,
9,
1,
9,
4,
9,
6,
4,
1,
{9}}, // No CMUX tree
(WopBootstrapTestParams){481,
1,
1024,
7.52316384526264e-37,
7.52316384526264e-37,
4,
9,
1,
9,
4,
9,
6,
4,
1,
{9}});
std::string printParamName(::testing::TestParamInfo<WopBootstrapTestParams> p) {
WopBootstrapTestParams params = p.param;
uint32_t lut_vector_size = (1 << (params.p_array[0] * params.tau));
std::string message = "Unknown_parameter_set";
if ((uint32_t)params.polynomial_size < lut_vector_size) {
// The LUT does not fit in a single polynomial: the CMUX tree is needed
// (full WoP-PBS).
message = "wop_pbs_full_n_" + std::to_string(params.lwe_dimension) + "_k_" +
std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_tau_" +
std::to_string(params.tau) + "_p_" +
std::to_string(params.p_array[0]);
} else if ((uint32_t)params.polynomial_size == lut_vector_size) {
// the VP skips the cmux tree.
message =
"wop_pbs_without_cmux_tree_n_" + std::to_string(params.lwe_dimension) +
"_k_" + std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_tau_" +
std::to_string(params.tau) + "_p_" + std::to_string(params.p_array[0]);
} else {
// the VP skips the cmux tree and expands the lut.
message = "wop_pbs_expanded_lut_n_" + std::to_string(params.lwe_dimension) +
"_k_" + std::to_string(params.glwe_dimension) + "_N_" +
std::to_string(params.polynomial_size) + "_tau_" +
std::to_string(params.tau) + "_p_" +
std::to_string(params.p_array[0]);
}
return message;
}
INSTANTIATE_TEST_CASE_P(WopBootstrapInstantiation,
WopBootstrapTestPrimitives_u64, wop_pbs_params_u64,
printParamName);

View File

@@ -1,410 +0,0 @@
#include <algorithm>
#include <bootstrap.h>
#include <bootstrap_multibit.h>
#include <cmath>
#include <concrete-cpu.h>
#include <cstdint>
#include <cstdlib>
#include <device.h>
#include <functional>
#include <random>
#include <utils.h>
double get_aws_cost_per_second() { return AWS_VM_COST_PER_HOUR / 3600; }
// For each sample and repetition, create a plaintext
// The payload_modulus is the message modulus times the carry modulus
// (so the total message modulus)
uint64_t *generate_plaintexts(uint64_t payload_modulus, uint64_t delta,
int number_of_inputs, const unsigned repetitions,
const unsigned samples) {
uint64_t *plaintext_array = (uint64_t *)malloc(
repetitions * samples * number_of_inputs * sizeof(uint64_t));
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<unsigned long long> dis(
std::numeric_limits<std::uint64_t>::min(),
std::numeric_limits<std::uint64_t>::max());
for (uint r = 0; r < repetitions; r++) {
for (uint s = 0; s < samples; s++) {
for (int i = 0; i < number_of_inputs; i++) {
plaintext_array[r * samples * number_of_inputs + s * number_of_inputs +
i] = (dis(gen) % payload_modulus) * delta;
}
}
}
return plaintext_array;
}
// For each sample and repetition, create a plaintext per CRT block for bit
// extract. The payload_modulus is the message modulus times the carry modulus
// (so the total message modulus)
uint64_t *generate_plaintexts_bit_extract(uint64_t *payload_modulus,
uint64_t *delta,
int crt_decomposition_size,
const unsigned repetitions,
const unsigned samples) {
uint64_t *plaintext_array = (uint64_t *)malloc(
repetitions * samples * crt_decomposition_size * sizeof(uint64_t));
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<unsigned long long> dis(
std::numeric_limits<std::uint64_t>::min(),
std::numeric_limits<std::uint64_t>::max());
for (size_t i = 0; i < crt_decomposition_size * repetitions * samples; i++) {
plaintext_array[i] =
(dis(gen) % payload_modulus[i % crt_decomposition_size]) *
delta[i % crt_decomposition_size];
}
return plaintext_array;
}
// Decompose value in r bits
// Bit decomposition of the value, from LSB to MSB
uint64_t *bit_decompose_value(uint64_t value, int r) {
uint64_t *bit_array = (uint64_t *)malloc(r * sizeof(uint64_t));
uint64_t x = value;
for (int i = 0; i < r; i++) {
bit_array[i] = x & 1;
x >>= 1;
}
return bit_array;
}
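// Illustrative example (not part of the original helper): decomposing the
// value 0b1011 (11) with r = 4 yields the array {1, 1, 0, 1}, i.e. least
// significant bit first.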
uint64_t *generate_identity_lut_pbs(int polynomial_size, int glwe_dimension,
int message_modulus, int carry_modulus,
std::function<uint64_t(uint64_t)> func) {
// Modulus of the msg contained in the msg bits and operations buffer
uint64_t modulus_sup = message_modulus * carry_modulus;
// N / modulus_sup = size of the box assigned to each message value
uint64_t box_size = polynomial_size / modulus_sup;
// Value of the shift we multiply our messages by
uint64_t delta = ((uint64_t)1 << 63) / (uint64_t)(modulus_sup);
// Create the plaintext lut_pbs
uint64_t *plaintext_lut_pbs =
(uint64_t *)malloc(polynomial_size * sizeof(uint64_t));
// Fill each box of the plaintext_lut_pbs with func applied to the
// corresponding message value
for (uint64_t i = 0; i < modulus_sup; i++) {
uint64_t index = i * box_size;
for (uint64_t j = index; j < index + box_size; j++) {
plaintext_lut_pbs[j] = func(i) * delta;
}
}
uint64_t half_box_size = box_size / 2;
// Negate the first half_box_size coefficients
for (uint64_t i = 0; i < half_box_size; i++) {
plaintext_lut_pbs[i] = -plaintext_lut_pbs[i];
}
// Rotate the plaintext_lut_pbs
std::rotate(plaintext_lut_pbs, plaintext_lut_pbs + half_box_size,
plaintext_lut_pbs + polynomial_size);
// Create the GLWE lut_pbs
uint64_t *lut_pbs = (uint64_t *)malloc(
polynomial_size * (glwe_dimension + 1) * sizeof(uint64_t));
for (int i = 0; i < polynomial_size * glwe_dimension; i++) {
lut_pbs[i] = 0;
}
for (int i = 0; i < polynomial_size; i++) {
int glwe_index = glwe_dimension * polynomial_size + i;
lut_pbs[glwe_index] = plaintext_lut_pbs[i];
}
free(plaintext_lut_pbs);
return lut_pbs;
}
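// Illustrative sketch (example values, not from the original code): with
// message_modulus = 2, carry_modulus = 2 and polynomial_size = 2048, each of
// the 4 possible message values owns a box of 512 coefficients. Negating the
// first 256 coefficients and rotating the polynomial by 256 centers each box
// on its message value, so that a noisy phase close to i * delta still lands
// in the box encoding func(i) despite the negacyclic rotation done by the PBS.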
uint64_t *generate_identity_lut_cmux_tree(int polynomial_size, int lut_size,
int tau, int delta_log) {
int r = 1;
if (log2(lut_size) > log2(polynomial_size)) {
r = log2(lut_size) - log2(polynomial_size);
}
uint64_t num_lut = (1 << r);
// Create the plaintext lut_pbs
uint64_t *plaintext_lut_cmux_tree =
(uint64_t *)malloc(num_lut * tau * polynomial_size * sizeof(uint64_t));
// Fill each LUT of the CMUX tree with a distinct constant test polynomial
for (int tree = 0; tree < tau; tree++)
for (uint64_t i = 0; i < num_lut; i++) {
uint64_t *plaintext_lut_slice = plaintext_lut_cmux_tree +
i * polynomial_size +
tree * num_lut * polynomial_size;
uint64_t coeff =
(((uint64_t)(i + tree * num_lut) % (1 << (64 - delta_log))))
<< delta_log;
for (int p = 0; p < polynomial_size; p++)
plaintext_lut_slice[p] = coeff;
}
return plaintext_lut_cmux_tree;
}
// Generate repetitions LWE secret keys
void generate_lwe_secret_keys(uint64_t **lwe_sk_array, int lwe_dimension,
Csprng *csprng, const unsigned repetitions) {
*lwe_sk_array =
(uint64_t *)malloc(lwe_dimension * repetitions * sizeof(uint64_t));
int shift = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the lwe secret key for each repetition
concrete_cpu_init_secret_key_u64(*lwe_sk_array + (ptrdiff_t)(shift),
lwe_dimension, csprng,
&CONCRETE_CSPRNG_VTABLE);
shift += lwe_dimension;
}
}
// Generate repetitions GLWE secret keys
void generate_glwe_secret_keys(uint64_t **glwe_sk_array, int glwe_dimension,
int polynomial_size, Csprng *csprng,
const unsigned repetitions) {
int glwe_sk_array_size = glwe_dimension * polynomial_size * repetitions;
*glwe_sk_array = (uint64_t *)malloc(glwe_sk_array_size * sizeof(uint64_t));
int shift = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the glwe secret key for each repetition
concrete_cpu_init_secret_key_u64(*glwe_sk_array + (ptrdiff_t)(shift),
glwe_dimension * polynomial_size, csprng,
&CONCRETE_CSPRNG_VTABLE);
shift += glwe_dimension * polynomial_size;
}
}
// Generate repetitions LWE bootstrap keys
void generate_lwe_bootstrap_keys(
cudaStream_t *stream, int gpu_index, double **d_fourier_bsk_array,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size, int pbs_level, int pbs_base_log,
Csprng *csprng, double variance, const unsigned repetitions) {
void *v_stream = (void *)stream;
int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
polynomial_size * (lwe_dimension + 1);
int bsk_array_size = bsk_size * repetitions;
uint64_t *bsk_array = (uint64_t *)malloc(bsk_array_size * sizeof(uint64_t));
*d_fourier_bsk_array = (double *)cuda_malloc_async(
bsk_array_size * sizeof(double), stream, gpu_index);
int shift_in = 0;
int shift_out = 0;
int shift_bsk = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the bootstrap key for each repetition
concrete_cpu_init_lwe_bootstrap_key_u64(
bsk_array + (ptrdiff_t)(shift_bsk),
lwe_sk_in_array + (ptrdiff_t)(shift_in),
lwe_sk_out_array + (ptrdiff_t)(shift_out), lwe_dimension,
polynomial_size, glwe_dimension, pbs_level, pbs_base_log, variance,
Parallelism(1), csprng, &CONCRETE_CSPRNG_VTABLE);
double *d_fourier_bsk = *d_fourier_bsk_array + (ptrdiff_t)(shift_bsk);
uint64_t *bsk = bsk_array + (ptrdiff_t)(shift_bsk);
cuda_synchronize_stream(v_stream);
cuda_convert_lwe_bootstrap_key_64(
(void *)(d_fourier_bsk), (void *)(bsk), v_stream, gpu_index,
lwe_dimension, glwe_dimension, pbs_level, polynomial_size);
shift_in += lwe_dimension;
shift_out += glwe_dimension * polynomial_size;
shift_bsk += bsk_size;
}
cuda_synchronize_stream(v_stream);
free(bsk_array);
}
void generate_lwe_multi_bit_pbs_keys(
cudaStream_t *stream, int gpu_index, uint64_t **d_bsk_array,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array, int lwe_dimension,
int glwe_dimension, int polynomial_size, int grouping_factor, int pbs_level,
int pbs_base_log, Csprng *csprng, double variance,
const unsigned repetitions) {
void *v_stream = (void *)stream;
int bsk_size = lwe_dimension * pbs_level * (glwe_dimension + 1) *
(glwe_dimension + 1) * polynomial_size *
(1 << grouping_factor) / grouping_factor;
int bsk_array_size = bsk_size * repetitions;
uint64_t *bsk_array = (uint64_t *)malloc(bsk_array_size * sizeof(uint64_t));
*d_bsk_array = (uint64_t *)cuda_malloc_async(
bsk_array_size * sizeof(uint64_t), stream, gpu_index);
int shift_in = 0;
int shift_out = 0;
int shift_bsk = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the multi-bit bootstrap key for each repetition
core_crypto_par_generate_lwe_multi_bit_bootstrapping_key(
lwe_sk_in_array + (ptrdiff_t)(shift_in), lwe_dimension,
lwe_sk_out_array + (ptrdiff_t)(shift_out), glwe_dimension,
polynomial_size, bsk_array + (ptrdiff_t)(shift_bsk), pbs_base_log,
pbs_level, grouping_factor, sqrt(variance), 0, 0);
uint64_t *d_bsk = *d_bsk_array + (ptrdiff_t)(shift_bsk);
uint64_t *bsk = bsk_array + (ptrdiff_t)(shift_bsk);
cuda_convert_lwe_multi_bit_bootstrap_key_64(
d_bsk, bsk, stream, gpu_index, lwe_dimension, glwe_dimension, pbs_level,
polynomial_size, grouping_factor);
shift_in += lwe_dimension;
shift_out += glwe_dimension * polynomial_size;
shift_bsk += bsk_size;
}
cuda_synchronize_stream(v_stream);
free(bsk_array);
}
// Generate repetitions keyswitch keys
void generate_lwe_keyswitch_keys(cudaStream_t *stream, int gpu_index,
uint64_t **d_ksk_array,
uint64_t *lwe_sk_in_array,
uint64_t *lwe_sk_out_array,
int input_lwe_dimension,
int output_lwe_dimension, int ksk_level,
int ksk_base_log, Csprng *csprng,
double variance, const unsigned repetitions) {
int ksk_size = ksk_level * (output_lwe_dimension + 1) * input_lwe_dimension;
int ksk_array_size = ksk_size * repetitions;
uint64_t *ksk_array = (uint64_t *)malloc(ksk_array_size * sizeof(uint64_t));
*d_ksk_array = (uint64_t *)cuda_malloc_async(
ksk_array_size * sizeof(uint64_t), stream, gpu_index);
int shift_in = 0;
int shift_out = 0;
int shift_ksk = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the keyswitch key for each repetition
concrete_cpu_init_lwe_keyswitch_key_u64(
ksk_array + (ptrdiff_t)(shift_ksk),
lwe_sk_in_array + (ptrdiff_t)(shift_in),
lwe_sk_out_array + (ptrdiff_t)(shift_out), input_lwe_dimension,
output_lwe_dimension, ksk_level, ksk_base_log, variance, csprng,
&CONCRETE_CSPRNG_VTABLE);
uint64_t *d_ksk = *d_ksk_array + (ptrdiff_t)(shift_ksk);
uint64_t *ksk = ksk_array + (ptrdiff_t)(shift_ksk);
cuda_memcpy_async_to_gpu(d_ksk, ksk, ksk_size * sizeof(uint64_t), stream,
gpu_index);
shift_in += input_lwe_dimension;
shift_out += output_lwe_dimension;
shift_ksk += ksk_size;
}
cuda_synchronize_stream(stream);
free(ksk_array);
}
// Generate repetitions private functional keyswitch key lists (with (k + 1)
// keys each)
void generate_lwe_private_functional_keyswitch_key_lists(
cudaStream_t *stream, int gpu_index, uint64_t **d_pksk_array,
uint64_t *lwe_sk_in_array, uint64_t *lwe_sk_out_array,
int input_lwe_dimension, int output_glwe_dimension,
int output_polynomial_size, int pksk_level, int pksk_base_log,
Csprng *csprng, double variance, const unsigned repetitions) {
int pksk_list_size = pksk_level * (output_glwe_dimension + 1) *
output_polynomial_size * (input_lwe_dimension + 1) *
(output_glwe_dimension + 1);
int pksk_array_size = pksk_list_size * repetitions;
uint64_t *pksk_array = (uint64_t *)malloc(pksk_array_size * sizeof(uint64_t));
*d_pksk_array = (uint64_t *)cuda_malloc_async(
pksk_array_size * sizeof(uint64_t), stream, gpu_index);
int shift_in = 0;
int shift_out = 0;
int shift_pksk_list = 0;
for (uint r = 0; r < repetitions; r++) {
// Generate the (k + 1) private functional keyswitch keys for each
// repetition
concrete_cpu_init_lwe_circuit_bootstrap_private_functional_packing_keyswitch_keys_u64(
pksk_array + (ptrdiff_t)(shift_pksk_list),
lwe_sk_in_array + (ptrdiff_t)(shift_in),
lwe_sk_out_array + (ptrdiff_t)(shift_out), input_lwe_dimension,
output_polynomial_size, output_glwe_dimension, pksk_level,
pksk_base_log, variance, Parallelism(1), csprng,
&CONCRETE_CSPRNG_VTABLE);
uint64_t *d_pksk_list = *d_pksk_array + (ptrdiff_t)(shift_pksk_list);
uint64_t *pksk_list = pksk_array + (ptrdiff_t)(shift_pksk_list);
cuda_memcpy_async_to_gpu(d_pksk_list, pksk_list,
pksk_list_size * sizeof(uint64_t), stream,
gpu_index);
shift_in += input_lwe_dimension;
shift_out += output_glwe_dimension * output_polynomial_size;
shift_pksk_list += pksk_list_size;
}
cuda_synchronize_stream(stream);
free(pksk_array);
}
// The closest number representable by the decomposition can be computed by
// performing the rounding at the appropriate bit.
uint64_t closest_representable(uint64_t input, int level_count, int base_log) {
// Compute the number of least significant bits which can not be represented
// by the decomposition
int non_rep_bit_count = 64 - (level_count * base_log);
// Generate a mask which captures the most significant non-representable bit
uint64_t one = 1;
uint64_t non_rep_mask = one << (non_rep_bit_count - 1);
// Retrieve the non representable bits
uint64_t non_rep_bits = input & non_rep_mask;
// Extract the msb of the non representable bits to perform the rounding
uint64_t non_rep_msb = non_rep_bits >> (non_rep_bit_count - 1);
// Remove the non-representable bits and perform the rounding
uint64_t res = input >> non_rep_bit_count;
res += non_rep_msb;
return res << non_rep_bit_count;
}
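// Illustrative example (values chosen for this sketch): with level_count = 2
// and base_log = 4, the 56 least significant bits are not representable; an
// input of (5UL << 56) + (1UL << 55) has its top non-representable bit set
// and therefore rounds up to 6UL << 56.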
uint64_t number_of_inputs_on_gpu(uint64_t gpu_index,
uint64_t lwe_ciphertext_count,
uint64_t number_of_gpus) {
uint64_t samples_per_gpu = lwe_ciphertext_count / number_of_gpus;
uint64_t samples = samples_per_gpu;
// We add the remainder of the integer division lwe_count/num_gpus to the load
// of the last GPU
if (gpu_index == number_of_gpus - 1) {
samples += lwe_ciphertext_count % number_of_gpus;
}
return samples;
}
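// Illustrative example (not from the original helper): with 100 ciphertexts
// spread over 3 GPUs, GPUs 0 and 1 each receive 33 samples and the last GPU
// receives 33 + 1 = 34.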
// See tfhe-rs for more explanations
// tfhe/src/integer/encryption.rs:152
void encrypt_integer_u64_blocks(uint64_t **ct, uint64_t *lwe_sk,
uint64_t *message_blocks, int lwe_dimension,
int num_blocks, Csprng *csprng,
double variance) {
for (int i = 0; i < num_blocks; i++) {
concrete_cpu_encrypt_lwe_ciphertext_u64(
lwe_sk, *ct + (ptrdiff_t)(i * (lwe_dimension + 1)), message_blocks[i],
lwe_dimension, variance, csprng, &CONCRETE_CSPRNG_VTABLE);
}
}
void decrypt_integer_u64_blocks(uint64_t *ct, uint64_t *lwe_sk,
uint64_t **message_blocks, int lwe_dimension,
int num_blocks, uint64_t delta,
int message_modulus) {
uint64_t rounding_bit = delta >> 1;
for (int i = 0; i < num_blocks; i++) {
uint64_t decrypted_u64 = 0;
concrete_cpu_decrypt_lwe_ciphertext_u64(
lwe_sk, ct + (ptrdiff_t)((lwe_dimension + 1) * i), lwe_dimension,
&decrypted_u64);
uint64_t rounding = (decrypted_u64 & rounding_bit) << 1;
uint64_t block_value =
((decrypted_u64 + rounding) / delta) % message_modulus;
(*message_blocks)[i] = block_value;
}
}
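// Illustrative example (values chosen for this sketch): with
// message_modulus = 4 and two blocks decoding to {3, 2} (least significant
// block first), the represented clear integer is 3 + 2 * 4 = 11, matching
// the radix reconstruction done in the integer multiplication test.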

View File

@@ -7,10 +7,10 @@
#define CONCRETELANG_GPUDFG_HPP
#ifdef CONCRETELANG_CUDA_SUPPORT
#include "bootstrap.h"
#include "device.h"
#include "keyswitch.h"
#include "linear_algebra.h"
#include "programmable_bootstrap.h"
#endif

View File

@@ -19,9 +19,9 @@
using ::concretelang::keysets::ServerKeyset;
#ifdef CONCRETELANG_CUDA_SUPPORT
#include "bootstrap.h"
#include "device.h"
#include "keyswitch.h"
#include "programmable_bootstrap.h"
#endif
namespace mlir {
@@ -102,14 +102,14 @@ public:
size_t bsk_gpu_buffer_size = bsk_buffer_len * sizeof(double);
void *bsk_gpu_tmp =
cuda_malloc_async(bsk_gpu_buffer_size, (cudaStream_t *)stream, gpu_idx);
cuda_convert_lwe_bootstrap_key_64(
bsk_gpu_tmp, const_cast<uint64_t *>(bsk.getBuffer().data()),
(cudaStream_t *)stream, gpu_idx, input_lwe_dim, glwe_dim, level,
poly_size);
cuda_malloc_async(bsk_gpu_buffer_size, (cudaStream_t)stream, gpu_idx);
cuda_convert_lwe_programmable_bootstrap_key_64(
(cudaStream_t)stream, gpu_idx, bsk_gpu_tmp,
const_cast<uint64_t *>(bsk.getBuffer().data()), input_lwe_dim, glwe_dim,
level, poly_size);
// Synchronization here is not optional as it works with mutex to
// prevent other GPU streams from reading partially copied keys.
cudaStreamSynchronize(*(cudaStream_t *)stream);
cudaStreamSynchronize((cudaStream_t)stream);
bsk_gpu[gpu_idx][bsk_idx] = bsk_gpu_tmp;
return bsk_gpu[gpu_idx][bsk_idx];
}
@@ -132,14 +132,14 @@ public:
size_t ksk_buffer_size = sizeof(uint64_t) * ksk.getBuffer().size();
void *ksk_gpu_tmp =
cuda_malloc_async(ksk_buffer_size, (cudaStream_t *)stream, gpu_idx);
cuda_malloc_async(ksk_buffer_size, (cudaStream_t)stream, gpu_idx);
cuda_memcpy_async_to_gpu(ksk_gpu_tmp,
const_cast<uint64_t *>(ksk.getBuffer().data()),
ksk_buffer_size, (cudaStream_t *)stream, gpu_idx);
ksk_buffer_size, (cudaStream_t)stream, gpu_idx);
// Synchronization here is not optional as it works with mutex to
// prevent other GPU streams from reading partially copied keys.
cudaStreamSynchronize(*(cudaStream_t *)stream);
cudaStreamSynchronize((cudaStream_t)stream);
ksk_gpu[gpu_idx][ksk_idx] = ksk_gpu_tmp;
return ksk_gpu[gpu_idx][ksk_idx];
}

View File

@@ -34,7 +34,7 @@ if(CONCRETELANG_DATAFLOW_EXECUTION_ENABLED)
endif()
if(CONCRETELANG_CUDA_SUPPORT)
target_link_libraries(ConcretelangRuntime LINK_PUBLIC concrete_cuda)
target_link_libraries(ConcretelangRuntime LINK_PUBLIC tfhe_cuda_backend)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
@@ -70,7 +70,7 @@ target_link_libraries(
$<TARGET_OBJECTS:MLIRSparseTensorRuntime>)
if(CONCRETELANG_CUDA_SUPPORT)
install(TARGETS ConcretelangRuntime omp concrete_cuda EXPORT ConcretelangRuntime)
install(TARGETS ConcretelangRuntime omp tfhe_cuda_backend EXPORT ConcretelangRuntime)
else()
install(TARGETS ConcretelangRuntime omp EXPORT ConcretelangRuntime)
endif()

View File

@@ -31,6 +31,16 @@ namespace concretelang {
namespace gpu_dfg {
namespace {
void *alloc_and_memcpy_async_to_gpu(uint64_t *buf_ptr, uint64_t buf_offset,
uint64_t buf_size, uint32_t gpu_idx,
void *stream) {
size_t buf_size_ = buf_size * sizeof(uint64_t);
void *ct_gpu = cuda_malloc_async(buf_size_, (cudaStream_t)stream, gpu_idx);
cuda_memcpy_async_to_gpu(ct_gpu, buf_ptr + buf_offset, buf_size_,
(cudaStream_t)stream, gpu_idx);
return ct_gpu;
}
#if CONCRETELANG_TIMING_ENABLED
static struct timespec init_timer, blocking_get_timer, acc1, acc2;
#endif
@@ -107,22 +117,25 @@ struct Dependence;
// is required.
struct PBS_buffer {
PBS_buffer(void *stream, uint32_t gpu_idx, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count)
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count)
: max_pbs_buffer_samples(input_lwe_ciphertext_count),
glwe_dim(glwe_dimension), poly_size(polynomial_size),
gpu_stream(stream), gpu_index(gpu_idx) {
scratch_cuda_bootstrap_amortized_64(
gpu_stream, gpu_index, &pbs_buffer, glwe_dim, poly_size,
max_pbs_buffer_samples, cuda_get_max_shared_memory(gpu_index), true);
glwe_dim(glwe_dimension), _level_count(level_count),
poly_size(polynomial_size), gpu_stream(stream), gpu_index(gpu_idx) {
scratch_cuda_programmable_bootstrap_64(gpu_stream, gpu_index, &pbs_buffer,
glwe_dim, poly_size, _level_count,
max_pbs_buffer_samples, true);
}
~PBS_buffer() {
cleanup_cuda_bootstrap_amortized(gpu_stream, gpu_index, &pbs_buffer);
cleanup_cuda_programmable_bootstrap(gpu_stream, gpu_index, &pbs_buffer);
}
int8_t *get_pbs_buffer(void *stream, uint32_t gpu_idx,
uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
assert(glwe_dimension <= glwe_dim);
assert(polynomial_size <= poly_size);
assert(level_count <= _level_count);
assert(input_lwe_ciphertext_count <= max_pbs_buffer_samples);
assert(stream == gpu_stream);
assert(gpu_idx == gpu_index);
@@ -134,6 +147,7 @@ struct PBS_buffer {
uint32_t max_pbs_buffer_samples;
uint32_t glwe_dim;
uint32_t poly_size;
uint32_t _level_count;
void *gpu_stream;
uint32_t gpu_index;
};
@@ -150,13 +164,14 @@ struct GPU_state {
if (pbs_buffer != nullptr)
delete pbs_buffer;
if (gpu_stream != nullptr)
cuda_destroy_stream((cudaStream_t *)gpu_stream, gpu_idx);
cuda_destroy_stream((cudaStream_t)gpu_stream, gpu_idx);
}
inline int8_t *get_pbs_buffer(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
if (pbs_buffer != nullptr && (pbs_buffer->glwe_dim != glwe_dimension ||
pbs_buffer->poly_size != polynomial_size ||
pbs_buffer->_level_count != level_count ||
pbs_buffer->get_max_pbs_buffer_samples() <
input_lwe_ciphertext_count)) {
delete pbs_buffer;
@@ -164,9 +179,10 @@ struct GPU_state {
}
if (pbs_buffer == nullptr)
pbs_buffer = new PBS_buffer(get_gpu_stream(), gpu_idx, glwe_dimension,
polynomial_size, input_lwe_ciphertext_count);
polynomial_size, level_count,
input_lwe_ciphertext_count);
return pbs_buffer->get_pbs_buffer(get_gpu_stream(), gpu_idx, glwe_dimension,
polynomial_size,
polynomial_size, level_count,
input_lwe_ciphertext_count);
}
inline void *get_gpu_stream() {
@@ -206,16 +222,17 @@ struct GPU_DFG {
to_free_list.clear();
}
inline int8_t *get_pbs_buffer(uint32_t glwe_dimension,
uint32_t polynomial_size,
uint32_t polynomial_size, uint32_t level_count,
uint32_t input_lwe_ciphertext_count) {
if (pbs_buffer == nullptr) {
int8_t *ret = gpus[gpu_idx].get_pbs_buffer(
glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
int8_t *ret =
gpus[gpu_idx].get_pbs_buffer(glwe_dimension, polynomial_size,
level_count, input_lwe_ciphertext_count);
pbs_buffer = gpus[gpu_idx].pbs_buffer;
return ret;
}
return pbs_buffer->get_pbs_buffer(gpu_stream, gpu_idx, glwe_dimension,
polynomial_size,
polynomial_size, level_count,
input_lwe_ciphertext_count);
}
inline void *get_gpu_stream(int32_t loc) {
@@ -234,7 +251,7 @@ private:
struct Dependence;
static void sdfg_gpu_debug_print_mref(const char *c, MemRef2 m);
static MemRef2 sdfg_gpu_debug_dependence(Dependence *d, cudaStream_t *s);
static MemRef2 sdfg_gpu_debug_dependence(Dependence *d, cudaStream_t s);
static bool sdfg_gpu_debug_compare_memref(MemRef2 &a, MemRef2 &b,
char const *msg);
@@ -374,7 +391,7 @@ struct Dependence {
return;
cuda_drop_async(
chunks[chunk_id]->device_data,
(cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location),
(cudaStream_t)dfg->get_gpu_stream(chunks[chunk_id]->location),
chunks[chunk_id]->location);
chunks[chunk_id]->device_data = nullptr;
}
@@ -385,8 +402,8 @@ struct Dependence {
data_offset +=
chunking_schedule[c] * host_data.sizes[1] * sizeof(uint64_t);
size_t csize = memref_get_data_size(chunks[chunk_id]->host_data);
cudaStream_t *s =
(cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location);
cudaStream_t s =
(cudaStream_t)dfg->get_gpu_stream(chunks[chunk_id]->location);
cuda_memcpy_async_to_cpu(((char *)host_data.aligned) + data_offset,
chunks[chunk_id]->device_data, csize, s,
chunks[chunk_id]->location);
@@ -404,7 +421,7 @@ struct Dependence {
return;
cuda_drop_async(
chunks[chunk_id]->device_data,
(cudaStream_t *)dfg->get_gpu_stream(chunks[chunk_id]->location),
(cudaStream_t)dfg->get_gpu_stream(chunks[chunk_id]->location),
chunks[chunk_id]->location);
chunks[chunk_id]->device_data = nullptr;
chunks[chunk_id]->location =
@@ -412,8 +429,8 @@ struct Dependence {
}
inline void free_data(GPU_DFG *dfg, bool immediate = false) {
if (device_data != nullptr) {
cuda_drop_async(device_data,
(cudaStream_t *)dfg->get_gpu_stream(location), location);
cuda_drop_async(device_data, (cudaStream_t)dfg->get_gpu_stream(location),
location);
}
if (onHostReady && host_data.allocated != nullptr && hostAllocated) {
// As streams are not synchronized aside from the GET operation,
@@ -442,16 +459,16 @@ struct Dependence {
host_data.allocated = host_data.aligned = (uint64_t *)malloc(data_size);
hostAllocated = true;
}
cudaStream_t *s = (cudaStream_t *)dfg->get_gpu_stream(location);
cudaStream_t s = (cudaStream_t)dfg->get_gpu_stream(location);
cuda_memcpy_async_to_cpu(host_data.aligned, device_data, data_size, s,
location);
if (synchronize)
cudaStreamSynchronize(*s);
cudaStreamSynchronize(s);
onHostReady = true;
} else {
assert(onHostReady &&
"Device-to-device data transfers not supported yet.");
cudaStream_t *s = (cudaStream_t *)dfg->get_gpu_stream(loc);
cudaStream_t s = (cudaStream_t)dfg->get_gpu_stream(loc);
if (device_data != nullptr)
cuda_drop_async(device_data, s, location);
device_data = cuda_malloc_async(data_size, s, loc);
@@ -681,7 +698,7 @@ struct Stream {
// TODO: this could be improved
// Force deallocation with a synchronization point
for (size_t g = 0; g < num_devices; ++g)
cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(g));
cudaStreamSynchronize((cudaStream_t)dfg->get_gpu_stream(g));
auto status = cudaMemGetInfo(&gpu_free_mem, &gpu_total_mem);
assert(status == cudaSuccess);
// TODO - for now assume each device on the system has roughly same
@@ -871,7 +888,7 @@ struct Stream {
iv->dep->free_chunk_device_data(c, dfg);
for (auto o : outputs)
o->dep->free_chunk_device_data(c, dfg);
cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(dev));
cudaStreamSynchronize((cudaStream_t)dfg->get_gpu_stream(dev));
}
},
queue, dev));
@@ -886,7 +903,7 @@ struct Stream {
for (auto o : outputs)
o->dep->finalize_merged_dependence(dfg);
for (dev = 0; dev < num_devices; ++dev)
cudaStreamSynchronize(*(cudaStream_t *)dfg->get_gpu_stream(dev));
cudaStreamSynchronize((cudaStream_t)dfg->get_gpu_stream(dev));
// We will assume that only one subgraph is being processed per
// DFG at a time, so we can safely free these here.
dfg->free_stream_order_dependent_data();
@@ -1004,7 +1021,7 @@ make_process_2_1(void *dfg, void *sin1, void *sin2, void *sout,
}
[[maybe_unused]] static MemRef2 sdfg_gpu_debug_dependence(Dependence *d,
cudaStream_t *s) {
cudaStream_t s) {
if (d->onHostReady)
return d->host_data;
size_t data_size = memref_get_data_size(d->host_data);
@@ -1015,7 +1032,7 @@ make_process_2_1(void *dfg, void *sin1, void *sin2, void *sout,
{d->host_data.sizes[0], d->host_data.sizes[1]},
{d->host_data.strides[0], d->host_data.strides[1]}};
cuda_memcpy_async_to_cpu(data, d->device_data, data_size, s, d->location);
cudaStreamSynchronize(*s);
cudaStreamSynchronize(s);
return ret;
}
@@ -1064,17 +1081,28 @@ void memref_keyswitch_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
return dep;
} else {
// Schedule the keyswitch kernel on the GPU
cudaStream_t *s = (cudaStream_t *)p->dfg->get_gpu_stream(loc);
cudaStream_t s = (cudaStream_t)p->dfg->get_gpu_stream(loc);
void *ct0_gpu = d->device_data;
void *out_gpu = cuda_malloc_async(data_size, s, loc);
void *ksk_gpu = p->ctx.val->get_ksk_gpu(
p->level.val, p->input_lwe_dim.val, p->output_lwe_dim.val, loc, s,
p->sk_index.val);
// Initialize indexes
uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
for (uint32_t i = 0; i < num_samples; i++) {
indexes[i] = i;
}
void *indexes_gpu =
alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples, loc, s);
cuda_keyswitch_lwe_ciphertext_vector_64(
s, loc, out_gpu, ct0_gpu, ksk_gpu, p->input_lwe_dim.val,
p->output_lwe_dim.val, p->base_log.val, p->level.val, num_samples);
s, loc, out_gpu, indexes_gpu, ct0_gpu, indexes_gpu, ksk_gpu,
p->input_lwe_dim.val, p->output_lwe_dim.val, p->base_log.val,
p->level.val, num_samples);
cuda_drop_async(indexes_gpu, s, loc);
Dependence *dep =
new Dependence(loc, out, out_gpu, false, false, d->chunk_id);
p->dfg->register_stream_order_dependent_allocation(indexes);
return dep;
}
};
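The 0.10.0 keyswitch entry point takes explicit per-sample index arrays for its output and input ciphertext vectors, which is why the scheduler above builds an identity mapping, uploads it, and passes the same device buffer twice. A hedged, standalone sketch of just that upload step; upload_identity_indexes is a name introduced here for illustration, and the real code inlines the loop and defers freeing the host buffer through the stream-order registry instead of synchronizing:
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdlib>
// Builds 0..num_samples-1 on the host and copies it to the device on the given
// stream; the returned pointer is what the code above passes as both the
// input- and output-index argument of cuda_keyswitch_lwe_ciphertext_vector_64.
// Requires CUDA >= 11.2 for the stream-ordered allocator.
static void *upload_identity_indexes(uint32_t num_samples, cudaStream_t s) {
  uint64_t *host = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
  for (uint32_t i = 0; i < num_samples; i++)
    host[i] = i;
  void *dev = nullptr;
  cudaMallocAsync(&dev, num_samples * sizeof(uint64_t), s);
  cudaMemcpyAsync(dev, host, num_samples * sizeof(uint64_t),
                  cudaMemcpyHostToDevice, s);
  cudaStreamSynchronize(s); // simplification: the scheduler defers this instead
  free(host);
  return dev;
}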
@@ -1108,7 +1136,7 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
}
auto sched = [&](Dependence *d0, Dependence *d1, uint64_t *glwe_ct,
std::vector<size_t> &lut_indexes, cudaStream_t *s,
std::vector<size_t> &lut_indexes, cudaStream_t s,
int32_t loc) {
uint64_t num_samples = d0->host_data.sizes[0];
MemRef2 out = {out_ptr,
@@ -1168,20 +1196,29 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
cuda_malloc_async(test_vector_idxes_size, s, loc);
cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, (void *)test_vector_idxes,
test_vector_idxes_size, s, loc);
// Initialize indexes
uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
for (uint32_t i = 0; i < num_samples; i++) {
indexes[i] = i;
}
void *indexes_gpu =
alloc_and_memcpy_async_to_gpu(indexes, 0, num_samples, loc, s);
int8_t *pbs_buffer = p->dfg->gpus[loc].get_pbs_buffer(
p->glwe_dim.val, p->poly_size.val, num_samples);
p->glwe_dim.val, p->poly_size.val, p->level.val, num_samples);
void *ct0_gpu = d0->device_data;
void *out_gpu = cuda_malloc_async(data_size, s, loc);
void *fbsk_gpu = p->ctx.val->get_bsk_gpu(
p->input_lwe_dim.val, p->poly_size.val, p->level.val, p->glwe_dim.val,
loc, s, p->sk_index.val);
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
s, loc, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu,
fbsk_gpu, (int8_t *)pbs_buffer, p->input_lwe_dim.val, p->glwe_dim.val,
p->poly_size.val, p->base_log.val, p->level.val, num_samples,
lut_indexes.size(), lwe_idx, cuda_get_max_shared_memory(loc));
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
s, loc, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu,
ct0_gpu, indexes_gpu, fbsk_gpu, (int8_t *)pbs_buffer,
p->input_lwe_dim.val, p->glwe_dim.val, p->poly_size.val,
p->base_log.val, p->level.val, num_samples, 1, 1);
cuda_drop_async(test_vector_idxes_gpu, s, loc);
cuda_drop_async(glwe_ct_gpu, s, loc);
cuda_drop_async(indexes_gpu, s, loc);
Dependence *dep =
new Dependence(loc, out, out_gpu, false, false, d0->chunk_id);
// As streams are not synchronized, we can only free this vector
@@ -1189,6 +1226,7 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
// this vector is no longer needed.
p->dfg->register_stream_order_dependent_allocation(test_vector_idxes);
p->dfg->register_stream_order_dependent_allocation(glwe_ct);
p->dfg->register_stream_order_dependent_allocation(indexes);
return dep;
}
};
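Because the index and test-vector buffers are consumed by asynchronous copies, they cannot be freed as soon as the kernel is queued; register_stream_order_dependent_allocation parks them until the DFG is known to be quiescent. A small sketch of that deferred-release idea, under the assumption stated in the comments above that one sub-graph runs per DFG at a time; the struct and member names are illustrative, not the actual GPU_DFG layout:
#include <cstdlib>
#include <vector>
struct StreamOrderedHostAllocations {
  std::vector<void *> pending;
  // Called right after handing a host buffer to an asynchronous copy.
  void defer(void *host_ptr) { pending.push_back(host_ptr); }
  // Called only once every stream that may still read these buffers has been
  // synchronized (free_stream_order_dependent_data plays this role above).
  void release_all() {
    for (void *p : pending)
      free(p);
    pending.clear();
  }
};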
@@ -1204,7 +1242,7 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
lut_indexes.push_back(0);
}
cudaStream_t *cstream = (cudaStream_t *)p->dfg->get_gpu_stream(loc);
cudaStream_t cstream = (cudaStream_t)p->dfg->get_gpu_stream(loc);
Dependence *idep0 = p->input_streams[0]->get(loc, chunk_id);
if (p->output_streams[0]->need_new_gen(chunk_id))
p->output_streams[0]->put(
@@ -1214,7 +1252,7 @@ void memref_bootstrap_lwe_u64_process(Process *p, int32_t loc, int32_t chunk_id,
void memref_add_lwe_ciphertexts_u64_process(Process *p, int32_t loc,
int32_t chunk_id,
uint64_t *out_ptr) {
auto sched = [&](Dependence *d0, Dependence *d1, cudaStream_t *s,
auto sched = [&](Dependence *d0, Dependence *d1, cudaStream_t s,
int32_t loc) {
assert(d0->host_data.sizes[0] == d1->host_data.sizes[0]);
assert(d0->host_data.sizes[1] == d1->host_data.sizes[1]);
@@ -1257,14 +1295,14 @@ void memref_add_lwe_ciphertexts_u64_process(Process *p, int32_t loc,
Dependence *idep1 = p->input_streams[1]->get(loc, chunk_id);
if (p->output_streams[0]->need_new_gen(chunk_id))
p->output_streams[0]->put(
sched(idep0, idep1, (cudaStream_t *)p->dfg->get_gpu_stream(loc), loc),
sched(idep0, idep1, (cudaStream_t)p->dfg->get_gpu_stream(loc), loc),
chunk_id);
}
void memref_add_plaintext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
int32_t chunk_id,
uint64_t *out_ptr) {
auto sched = [&](Dependence *d0, Dependence *d1, cudaStream_t *s,
auto sched = [&](Dependence *d0, Dependence *d1, cudaStream_t s,
int32_t loc) {
assert(d0->host_data.sizes[0] == d1->host_data.sizes[1] ||
d1->host_data.sizes[1] == 1);
@@ -1315,14 +1353,14 @@ void memref_add_plaintext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
Dependence *idep1 = p->input_streams[1]->get(loc, chunk_id);
if (p->output_streams[0]->need_new_gen(chunk_id))
p->output_streams[0]->put(
sched(idep0, idep1, (cudaStream_t *)p->dfg->get_gpu_stream(loc), loc),
sched(idep0, idep1, (cudaStream_t)p->dfg->get_gpu_stream(loc), loc),
chunk_id);
}
void memref_mul_cleartext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
int32_t chunk_id,
uint64_t *out_ptr) {
auto sched = [&](Dependence *d0, Dependence *d1, cudaStream_t *s,
auto sched = [&](Dependence *d0, Dependence *d1, cudaStream_t s,
int32_t loc) {
assert(d0->host_data.sizes[0] == d1->host_data.sizes[1] ||
d1->host_data.sizes[1] == 1);
@@ -1373,14 +1411,14 @@ void memref_mul_cleartext_lwe_ciphertext_u64_process(Process *p, int32_t loc,
Dependence *idep1 = p->input_streams[1]->get(loc, chunk_id);
if (p->output_streams[0]->need_new_gen(chunk_id))
p->output_streams[0]->put(
sched(idep0, idep1, (cudaStream_t *)p->dfg->get_gpu_stream(loc), loc),
sched(idep0, idep1, (cudaStream_t)p->dfg->get_gpu_stream(loc), loc),
chunk_id);
}
void memref_negate_lwe_ciphertext_u64_process(Process *p, int32_t loc,
int32_t chunk_id,
uint64_t *out_ptr) {
auto sched = [&](Dependence *d0, cudaStream_t *s, int32_t loc) {
auto sched = [&](Dependence *d0, cudaStream_t s, int32_t loc) {
uint64_t num_samples = d0->host_data.sizes[0];
MemRef2 out = {out_ptr,
out_ptr,
@@ -1415,8 +1453,7 @@ void memref_negate_lwe_ciphertext_u64_process(Process *p, int32_t loc,
Dependence *idep0 = p->input_streams[0]->get(loc, chunk_id);
if (p->output_streams[0]->need_new_gen(chunk_id))
p->output_streams[0]->put(
sched(idep0, (cudaStream_t *)p->dfg->get_gpu_stream(loc), loc),
chunk_id);
sched(idep0, (cudaStream_t)p->dfg->get_gpu_stream(loc), loc), chunk_id);
}
} // namespace

View File

@@ -45,9 +45,9 @@ void *alloc_and_memcpy_async_to_gpu(uint64_t *buf_ptr, uint64_t buf_offset,
uint64_t buf_size, uint32_t gpu_idx,
void *stream) {
size_t buf_size_ = buf_size * sizeof(uint64_t);
void *ct_gpu = cuda_malloc_async(buf_size_, (cudaStream_t *)stream, gpu_idx);
void *ct_gpu = cuda_malloc_async(buf_size_, (cudaStream_t)stream, gpu_idx);
cuda_memcpy_async_to_gpu(ct_gpu, buf_ptr + buf_offset, buf_size_,
(cudaStream_t *)stream, gpu_idx);
(cudaStream_t)stream, gpu_idx);
return ct_gpu;
}
@@ -55,7 +55,7 @@ void memcpy_async_to_cpu(uint64_t *buf_ptr, uint64_t buf_offset,
uint64_t buf_size, void *buf_gpu, uint32_t gpu_idx,
void *stream) {
cuda_memcpy_async_to_cpu(buf_ptr + buf_offset, buf_gpu,
buf_size * sizeof(uint64_t), (cudaStream_t *)stream,
buf_size * sizeof(uint64_t), (cudaStream_t)stream,
gpu_idx);
}
@@ -132,21 +132,31 @@ void memref_batched_keyswitch_lwe_cuda_u64(
// Move the input and output batch of ciphertexts to the GPU
// TODO: The allocation should be done by the compiler codegen
void *ct0_gpu = alloc_and_memcpy_async_to_gpu(
ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t *)stream);
ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t)stream);
// Initialize indexes
uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
for (uint32_t i = 0; i < num_samples; i++) {
indexes[i] = i;
}
// size is in uint64_t elements; the helper scales by sizeof(uint64_t) itself
void *indexes_gpu = alloc_and_memcpy_async_to_gpu(
indexes, 0, num_samples, gpu_idx, (cudaStream_t)stream);
void *out_gpu = cuda_malloc_async(out_batch_size * sizeof(uint64_t),
(cudaStream_t *)stream, gpu_idx);
(cudaStream_t)stream, gpu_idx);
// Run the keyswitch kernel on the GPU
cuda_keyswitch_lwe_ciphertext_vector_64(
stream, gpu_idx, out_gpu, ct0_gpu, ksk_gpu, input_lwe_dim, output_lwe_dim,
base_log, level, num_samples);
stream, gpu_idx, out_gpu, indexes_gpu, ct0_gpu, indexes_gpu, ksk_gpu,
input_lwe_dim, output_lwe_dim, base_log, level, num_samples);
// Copy the output batch of ciphertext back to CPU
memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx,
stream);
cuda_synchronize_device(gpu_idx);
// free memory that we allocated on gpu
cuda_drop(indexes_gpu, gpu_idx);
cuda_drop(ct0_gpu, gpu_idx);
cuda_drop(out_gpu, gpu_idx);
cuda_destroy_stream((cudaStream_t *)stream, gpu_idx);
cuda_destroy_stream((cudaStream_t)stream, gpu_idx);
free(indexes);
}
void memref_batched_bootstrap_lwe_cuda_u64(
@@ -178,9 +188,9 @@ void memref_batched_bootstrap_lwe_cuda_u64(
// Move the input and output batch of ciphertext to the GPU
// TODO: The allocation should be done by the compiler codegen
void *ct0_gpu = alloc_and_memcpy_async_to_gpu(
ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t *)stream);
ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t)stream);
void *out_gpu = cuda_malloc_async(out_batch_size * sizeof(uint64_t),
(cudaStream_t *)stream, gpu_idx);
(cudaStream_t)stream, gpu_idx);
// Construct the glwe accumulator (on CPU)
// TODO: Should be done outside of the bootstrap call, compile time if
// possible. Refactor in progress
@@ -198,41 +208,49 @@ void memref_batched_bootstrap_lwe_cuda_u64(
// Move the glwe accumulator to the GPU
void *glwe_ct_gpu = alloc_and_memcpy_async_to_gpu(
glwe_ct, 0, glwe_ct_size, gpu_idx, (cudaStream_t *)stream);
glwe_ct, 0, glwe_ct_size, gpu_idx, (cudaStream_t)stream);
// Move the test vector indexes to the GPU; they are all set to 0
uint32_t num_test_vectors = 1, lwe_idx = 0,
test_vector_idxes_size = num_samples * sizeof(uint64_t);
void *test_vector_idxes = malloc(test_vector_idxes_size);
memset(test_vector_idxes, 0, test_vector_idxes_size);
void *test_vector_idxes_gpu = cuda_malloc_async(
test_vector_idxes_size, (cudaStream_t *)stream, gpu_idx);
void *test_vector_idxes_gpu =
cuda_malloc_async(test_vector_idxes_size, (cudaStream_t)stream, gpu_idx);
cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, test_vector_idxes,
test_vector_idxes_size, (cudaStream_t *)stream,
test_vector_idxes_size, (cudaStream_t)stream,
gpu_idx);
// Initialize indexes
uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
for (uint32_t i = 0; i < num_samples; i++) {
indexes[i] = i;
}
// size is in uint64_t elements; the helper scales by sizeof(uint64_t) itself
void *indexes_gpu = alloc_and_memcpy_async_to_gpu(
indexes, 0, num_samples, gpu_idx, (cudaStream_t)stream);
// Allocate PBS buffer on GPU
scratch_cuda_bootstrap_amortized_64(
stream, gpu_idx, &pbs_buffer, glwe_dim, poly_size, num_samples,
cuda_get_max_shared_memory(gpu_idx), true);
scratch_cuda_programmable_bootstrap_64(stream, gpu_idx, &pbs_buffer, glwe_dim,
poly_size, level, num_samples, true);
// Run the bootstrap kernel on the GPU
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, gpu_idx, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu,
fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim, poly_size, base_log, level,
num_samples, num_test_vectors, lwe_idx,
cuda_get_max_shared_memory(gpu_idx));
cleanup_cuda_bootstrap_amortized(stream, gpu_idx, &pbs_buffer);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
stream, gpu_idx, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu,
ct0_gpu, indexes_gpu, fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim,
poly_size, base_log, level, num_samples, 1, 1);
cleanup_cuda_programmable_bootstrap(stream, gpu_idx, &pbs_buffer);
// Copy the output batch of ciphertext back to CPU
memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx,
stream);
// free memory that we allocated on gpu
cuda_drop_async(ct0_gpu, (cudaStream_t *)stream, gpu_idx);
cuda_drop_async(out_gpu, (cudaStream_t *)stream, gpu_idx);
cuda_drop_async(glwe_ct_gpu, (cudaStream_t *)stream, gpu_idx);
cuda_drop_async(test_vector_idxes_gpu, (cudaStream_t *)stream, gpu_idx);
cudaStreamSynchronize(*(cudaStream_t *)stream);
cuda_drop_async(indexes_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(ct0_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(out_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(glwe_ct_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(test_vector_idxes_gpu, (cudaStream_t)stream, gpu_idx);
cudaStreamSynchronize((cudaStream_t)stream);
// Free the glwe accumulator (on CPU)
free(glwe_ct);
cuda_destroy_stream((cudaStream_t *)stream, gpu_idx);
free(indexes);
cuda_destroy_stream((cudaStream_t)stream, gpu_idx);
}
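The batched bootstrap now goes through the renamed programmable-bootstrap API: a scratch call sized by (glwe_dim, poly_size, level, num_samples), the launch itself with explicit output/input index arrays, and a matching cleanup. A condensed outline of that lifecycle, mirroring the argument order visible above; it assumes the tfhe-cuda-backend declarations this file already uses, takes every device pointer as a parameter, and passes the trailing 1, 1 verbatim (presumably the LUT count and LUT index):
#include <cstdint>
static void run_programmable_bootstrap_once(
    void *stream, uint32_t gpu_idx, void *out_gpu, void *indexes_gpu,
    void *glwe_ct_gpu, void *test_vector_idxes_gpu, void *ct0_gpu,
    void *fbsk_gpu, uint32_t input_lwe_dim, uint32_t glwe_dim,
    uint32_t poly_size, uint32_t base_log, uint32_t level,
    uint32_t num_samples) {
  int8_t *pbs_buffer = nullptr;
  // 1. Allocate the temporary buffer; the level count is now part of its sizing.
  scratch_cuda_programmable_bootstrap_64(stream, gpu_idx, &pbs_buffer, glwe_dim,
                                         poly_size, level, num_samples, true);
  // 2. Launch the bootstrap; the same identity index buffer is used for the
  //    output and input indexes, as in the calls above.
  cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
      stream, gpu_idx, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu,
      ct0_gpu, indexes_gpu, fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim,
      poly_size, base_log, level, num_samples, 1, 1);
  // 3. Release the temporary buffer.
  cleanup_cuda_programmable_bootstrap(stream, gpu_idx, &pbs_buffer);
}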
void memref_batched_mapped_bootstrap_lwe_cuda_u64(
@@ -268,9 +286,9 @@ void memref_batched_mapped_bootstrap_lwe_cuda_u64(
// Move the input and output batch of ciphertext to the GPU
// TODO: The allocation should be done by the compiler codegen
void *ct0_gpu = alloc_and_memcpy_async_to_gpu(
ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t *)stream);
ct0_aligned, ct0_offset, ct0_batch_size, gpu_idx, (cudaStream_t)stream);
void *out_gpu = cuda_malloc_async(out_batch_size * sizeof(uint64_t),
(cudaStream_t *)stream, gpu_idx);
(cudaStream_t)stream, gpu_idx);
// Construct the glwe accumulator (on CPU)
// TODO: Should be done outside of the bootstrap call, compile time if
// possible. Refactor in progress
@@ -291,7 +309,7 @@ void memref_batched_mapped_bootstrap_lwe_cuda_u64(
// Move the glwe accumulator to the GPU
void *glwe_ct_gpu = alloc_and_memcpy_async_to_gpu(
glwe_ct, 0, glwe_ct_size, gpu_idx, (cudaStream_t *)stream);
glwe_ct, 0, glwe_ct_size, gpu_idx, (cudaStream_t)stream);
// Move the test vector indexes to the GPU (zero by default)
uint32_t lwe_idx = 0, test_vector_idxes_size = num_samples * sizeof(uint64_t);
@@ -303,34 +321,43 @@ void memref_batched_mapped_bootstrap_lwe_cuda_u64(
for (size_t i = 0; i < num_lut_vectors; ++i)
test_vector_idxes[i] = i;
}
void *test_vector_idxes_gpu = cuda_malloc_async(
test_vector_idxes_size, (cudaStream_t *)stream, gpu_idx);
void *test_vector_idxes_gpu =
cuda_malloc_async(test_vector_idxes_size, (cudaStream_t)stream, gpu_idx);
cuda_memcpy_async_to_gpu(test_vector_idxes_gpu, (void *)test_vector_idxes,
test_vector_idxes_size, (cudaStream_t *)stream,
test_vector_idxes_size, (cudaStream_t)stream,
gpu_idx);
// Initialize indexes
uint64_t *indexes = (uint64_t *)malloc(num_samples * sizeof(uint64_t));
for (uint32_t i = 0; i < num_samples; i++) {
indexes[i] = i;
}
// size is in uint64_t elements; the helper scales by sizeof(uint64_t) itself
void *indexes_gpu = alloc_and_memcpy_async_to_gpu(
indexes, 0, num_samples, gpu_idx, (cudaStream_t)stream);
// Allocate PBS buffer on GPU
scratch_cuda_bootstrap_amortized_64(
stream, gpu_idx, &pbs_buffer, glwe_dim, poly_size, num_samples,
cuda_get_max_shared_memory(gpu_idx), true);
scratch_cuda_programmable_bootstrap_64(stream, gpu_idx, &pbs_buffer, glwe_dim,
poly_size, level, num_samples, true);
// Run the bootstrap kernel on the GPU
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, gpu_idx, out_gpu, glwe_ct_gpu, test_vector_idxes_gpu, ct0_gpu,
fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim, poly_size, base_log, level,
num_samples, num_lut_vectors, lwe_idx,
cuda_get_max_shared_memory(gpu_idx));
cleanup_cuda_bootstrap_amortized(stream, gpu_idx, &pbs_buffer);
cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
stream, gpu_idx, out_gpu, indexes_gpu, glwe_ct_gpu, test_vector_idxes_gpu,
ct0_gpu, indexes_gpu, fbsk_gpu, pbs_buffer, input_lwe_dim, glwe_dim,
poly_size, base_log, level, num_samples, 1, 1);
cleanup_cuda_programmable_bootstrap(stream, gpu_idx, &pbs_buffer);
// Copy the output batch of ciphertext back to CPU
memcpy_async_to_cpu(out_aligned, out_offset, out_batch_size, out_gpu, gpu_idx,
stream);
// free memory that we allocated on gpu
cuda_drop_async(ct0_gpu, (cudaStream_t *)stream, gpu_idx);
cuda_drop_async(out_gpu, (cudaStream_t *)stream, gpu_idx);
cuda_drop_async(glwe_ct_gpu, (cudaStream_t *)stream, gpu_idx);
cuda_drop_async(test_vector_idxes_gpu, (cudaStream_t *)stream, gpu_idx);
cudaStreamSynchronize(*(cudaStream_t *)stream);
cuda_drop_async(indexes_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(ct0_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(out_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(glwe_ct_gpu, (cudaStream_t)stream, gpu_idx);
cuda_drop_async(test_vector_idxes_gpu, (cudaStream_t)stream, gpu_idx);
cudaStreamSynchronize((cudaStream_t)stream);
// Free the glwe accumulator (on CPU)
free(indexes);
free(glwe_ct);
cuda_destroy_stream((cudaStream_t *)stream, gpu_idx);
cuda_destroy_stream((cudaStream_t)stream, gpu_idx);
}
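Both batched bootstrap variants finish with the same teardown order: queue the device frees on the stream, synchronize once, release the host staging buffers the asynchronous copies were still reading, and finally destroy the stream. A standalone illustration of that ordering with the plain CUDA runtime (CUDA >= 11.2 for the stream-ordered allocator); cudaFreeAsync and cudaStreamDestroy stand in for cuda_drop_async and cuda_destroy_stream:
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdlib>
int main() {
  cudaStream_t s;
  cudaStreamCreate(&s);
  uint64_t *host = (uint64_t *)calloc(256, sizeof(uint64_t));
  void *dev = nullptr;
  cudaMallocAsync(&dev, 256 * sizeof(uint64_t), s);
  cudaMemcpyAsync(dev, host, 256 * sizeof(uint64_t), cudaMemcpyHostToDevice, s);
  cudaFreeAsync(dev, s);    // device frees are queued, not immediate
  cudaStreamSynchronize(s); // everything queued above has completed here
  free(host);               // the host buffer was in use until the sync
  cudaStreamDestroy(s);
  return 0;
}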
#endif

1
third_party/tfhe-rs vendored Submodule

Submodule third_party/tfhe-rs added at 35fdcdf1a6