// tfhe-rs/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
#ifndef CUDA_INTEGER_UTILITIES_H
#define CUDA_INTEGER_UTILITIES_H
#include "integer.h"
#include "integer/radix_ciphertext.cuh"
#include "integer/radix_ciphertext.h"
#include "keyswitch/keyswitch.h"
#include "pbs/programmable_bootstrap.cuh"
#include "utils/helper_multi_gpu.cuh"
#include <algorithm>
#include <cmath>
#include <functional>
#include <limits>
#include <queue>
#include <stdio.h>
#include <vector>
#include "crypto/keyswitch.cuh"
class NoiseLevel {
public:
// Constants mirroring the Rust NoiseLevel implementation
static const uint64_t NOMINAL = 1;
static const uint64_t ZERO = 0;
static const uint64_t UNKNOWN = std::numeric_limits<uint64_t>::max();
};
#ifdef DEBUG
#define CHECK_NOISE_LEVEL(noise_level_expr, msg_mod, carry_mod) \
do { \
if ((msg_mod) == 2 && (carry_mod) == 2) { \
constexpr int max_noise_level = 3; \
if ((noise_level_expr) > max_noise_level) \
PANIC("Cuda error: noise exceeds maximum authorized value for 1_1 " \
"parameters"); \
} else if ((msg_mod) == 4 && (carry_mod) == 4) { \
constexpr int max_noise_level = 5; \
if ((noise_level_expr) > max_noise_level) \
PANIC("Cuda error: noise exceeds maximum authorized value for 2_2 " \
"parameters"); \
} else if ((msg_mod) == 8 && (carry_mod) == 8) { \
constexpr int max_noise_level = 9; \
if ((noise_level_expr) > max_noise_level) \
PANIC("Cuda error: noise exceeds maximum authorized value for 3_3 " \
"parameters"); \
} else if ((msg_mod) == 0 && (carry_mod) == 0) { \
break; \
} else if ((msg_mod) == 4 && (carry_mod) == 32) { \
break; \
} else { \
PANIC("Invalid message modulus or carry modulus") \
} \
} while (0)
#else
#define CHECK_NOISE_LEVEL(noise_level_expr, message_modulus, carry_modulus) \
do { \
} while (0)
#endif
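// Illustrative sketch (hypothetical helper, not part of this header): noise
// levels add up under homomorphic addition, so a caller can bound the result
// of adding two blocks against the limit encoded in the macro above. For 2_2
// parameters (message_modulus = 4, carry_modulus = 4) the maximum authorized
// level is 5, so adding two NOMINAL blocks passes the check.
inline void example_check_noise_after_add(uint64_t lhs_noise_level,
                                          uint64_t rhs_noise_level) {
  CHECK_NOISE_LEVEL(lhs_noise_level + rhs_noise_level, /*msg_mod=*/4,
                    /*carry_mod=*/4);
}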
template <typename Torus>
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
uint32_t value, uint32_t blocks_count,
uint32_t lwe_size);
void generate_ids_update_degrees(uint64_t *terms_degree, size_t *h_lwe_idx_in,
size_t *h_lwe_idx_out,
int32_t *h_smart_copy_in,
int32_t *h_smart_copy_out, size_t ch_amount,
uint32_t num_radix, uint32_t num_blocks,
size_t chunk_size, size_t message_max,
size_t &total_count, size_t &message_count,
size_t &carry_count, size_t &sm_copy_count);
/*
* generate bivariate accumulator (lut) for device pointer
* stream - cuda stream
* acc_bivariate - device pointer for bivariate accumulator
* ...
* f - wrapping function with two Torus inputs
*/
template <typename Torus>
void generate_device_accumulator_bivariate(
cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f, bool gpu_memory_allocated);
template <typename Torus>
void generate_device_accumulator_bivariate_with_factor(
cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus, Torus)> f, int factor,
bool gpu_memory_allocated);
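// Illustrative sketch (hypothetical helper, not part of this header): a
// bivariate LUT is typically built from a lambda over the two block values,
// here a wrapping addition modulo message_modulus. The stream, gpu_index and
// the acc_bivariate/degree/max_degree pointers are assumed to be valid and
// already allocated by the caller.
inline void example_build_bivariate_add_lut(
    cudaStream_t stream, uint32_t gpu_index, uint64_t *acc_bivariate,
    uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t message_modulus,
    uint32_t carry_modulus) {
  auto f_add = [message_modulus](uint64_t x, uint64_t y) -> uint64_t {
    return (x + y) % message_modulus;
  };
  generate_device_accumulator_bivariate<uint64_t>(
      stream, gpu_index, acc_bivariate, degree, max_degree, glwe_dimension,
      polynomial_size, message_modulus, carry_modulus, f_add,
      /*gpu_memory_allocated=*/true);
}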
template <typename Torus>
void generate_device_accumulator_with_encoding(
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t input_message_modulus, uint32_t input_carry_modulus,
uint32_t output_message_modulus, uint32_t output_carry_modulus,
std::function<Torus(Torus)> f, bool gpu_memory_allocated);
template <typename Torus>
void generate_device_accumulator_no_encoding(
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
uint32_t message_modulus, uint32_t carry_modulus, uint32_t glwe_dimension,
uint32_t polynomial_size, std::function<Torus(uint32_t)> f,
bool gpu_memory_allocated);
/*
* generate univariate accumulator (lut) for device pointer
* stream - cuda stream
* acc - device pointer for univariate accumulator
* ...
* f - evaluating function with one Torus input
*/
template <typename Torus>
void generate_device_accumulator(
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
std::function<Torus(Torus)> f, bool gpu_memory_allocated);
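// Illustrative usage (int_fullprop_buffer below shows a real call site): a
// univariate LUT is built from a lambda over a single block value, e.g. a
// message extraction LUT:
//   auto f_message = [msg_mod](Torus x) -> Torus { return x % msg_mod; };
//   generate_device_accumulator<Torus>(stream, gpu_index, acc, degree,
//                                      max_degree, glwe_dimension,
//                                      polynomial_size, msg_mod, carry_mod,
//                                      f_message, gpu_memory_allocated);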
template <typename Torus>
void generate_many_lut_device_accumulator(
cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degrees,
uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t message_modulus, uint32_t carry_modulus,
std::vector<std::function<Torus(Torus)>> &f, bool gpu_memory_allocated);
struct radix_columns {
std::vector<uint32_t> columns_counter;
uint32_t num_blocks;
uint32_t num_radix_in_vec;
uint32_t chunk_size;
radix_columns(const uint64_t *const input_degrees, uint32_t num_blocks,
uint32_t num_radix_in_vec, uint32_t chunk_size,
bool &needs_processing)
: num_blocks(num_blocks), num_radix_in_vec(num_radix_in_vec),
chunk_size(chunk_size) {
needs_processing = false;
columns_counter.resize(num_blocks, 0);
for (uint32_t i = 0; i < num_radix_in_vec; ++i) {
for (uint32_t j = 0; j < num_blocks; ++j) {
if (input_degrees[i * num_blocks + j])
columns_counter[j] += 1;
}
}
for (uint32_t i = 0; i < num_blocks; ++i) {
if (columns_counter[i] > chunk_size) {
needs_processing = true;
break;
}
}
}
void next_accumulation(uint32_t &total_ciphertexts,
uint32_t &message_ciphertexts,
bool &needs_processing) {
message_ciphertexts = 0;
total_ciphertexts = 0;
needs_processing = false;
for (int i = num_blocks - 1; i > 0; --i) {
uint32_t cur_count = columns_counter[i];
uint32_t prev_count = columns_counter[i - 1];
uint32_t new_count = 0;
// accumulated blocks from the current column
new_count += cur_count / chunk_size;
// every accumulated message block needs a PBS
message_ciphertexts += new_count;
// carry blocks coming from the previous column
new_count += prev_count / chunk_size;
// both carry and message blocks that need a PBS
total_ciphertexts += new_count;
// now add the remaining non-accumulated blocks, which do not require a PBS
new_count += cur_count % chunk_size;
columns_counter[i] = new_count;
if (new_count > chunk_size)
needs_processing = true;
}
// finally, do the same for the 0th column
uint32_t new_count = columns_counter[0] / chunk_size;
message_ciphertexts += new_count;
total_ciphertexts += new_count;
new_count += columns_counter[0] % chunk_size;
columns_counter[0] = new_count;
if (new_count > chunk_size) {
needs_processing = true;
}
}
};
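// Worked example (2_2 parameters, chunk_size = 5): summing 7 radix ciphertexts
// of 2 non-empty blocks each gives columns_counter = {7, 7}, so
// needs_processing starts out true. A single next_accumulation pass then
// yields message_ciphertexts = 2, total_ciphertexts = 3 and
// columns_counter = {3, 4}, after which no column exceeds chunk_size and no
// further pass is needed.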
inline void calculate_final_degrees(uint64_t *const out_degrees,
const uint64_t *const input_degrees,
uint32_t num_blocks,
uint32_t num_radix_in_vec,
uint32_t chunk_size,
uint64_t message_modulus) {
auto get_degree = [message_modulus](uint64_t degree) -> uint64_t {
return std::min(message_modulus - 1, degree);
};
std::vector<std::queue<uint64_t>> columns(num_blocks);
for (uint32_t i = 0; i < num_radix_in_vec; ++i) {
for (uint32_t j = 0; j < num_blocks; ++j) {
if (input_degrees[i * num_blocks + j])
columns[j].push(input_degrees[i * num_blocks + j]);
}
}
for (uint32_t i = 0; i < num_blocks; ++i) {
auto &col = columns[i];
while (col.size() > 1) {
uint32_t cur_degree = 0;
uint32_t mn = std::min(chunk_size, (uint32_t)col.size());
for (int j = 0; j < mn; ++j) {
cur_degree += col.front();
col.pop();
}
const uint64_t new_degree = get_degree(cur_degree);
col.push(new_degree);
if ((i + 1) < num_blocks) {
columns[i + 1].push(new_degree);
}
}
}
for (int i = 0; i < num_blocks; i++) {
out_degrees[i] = (columns[i].empty()) ? 0 : columns[i].front();
}
}
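// Worked example (message_modulus = 4, chunk_size = 5): adding two radix
// ciphertexts of 2 blocks whose input degrees are all 3 gives the columns
// {3, 3} and {3, 3}. Column 0 folds into a single block of degree
// min(3, 3 + 3) = 3 and pushes that degree as a carry into column 1, which
// then folds {3, 3, 3} into min(3, 9) = 3. The resulting out_degrees are
// {3, 3}: every output degree is capped at message_modulus - 1.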
struct int_radix_params {
PBS_TYPE pbs_type;
uint32_t glwe_dimension;
uint32_t polynomial_size;
uint32_t big_lwe_dimension;
uint32_t small_lwe_dimension;
uint32_t ks_level;
uint32_t ks_base_log;
uint32_t pbs_level;
uint32_t pbs_base_log;
uint32_t grouping_factor;
uint32_t message_modulus;
uint32_t carry_modulus;
PBS_MS_REDUCTION_T noise_reduction_type;
int_radix_params(PBS_TYPE pbs_type, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level,
uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t message_modulus, uint32_t carry_modulus,
PBS_MS_REDUCTION_T noise_reduction_type)
: pbs_type(pbs_type), glwe_dimension(glwe_dimension),
polynomial_size(polynomial_size), big_lwe_dimension(big_lwe_dimension),
small_lwe_dimension(small_lwe_dimension), ks_level(ks_level),
ks_base_log(ks_base_log), pbs_level(pbs_level),
pbs_base_log(pbs_base_log), grouping_factor(grouping_factor),
message_modulus(message_modulus), carry_modulus(carry_modulus),
noise_reduction_type(noise_reduction_type){};
int_radix_params() = default;
void print() {
printf("pbs_type: %u, glwe_dimension: %u, "
"polynomial_size: %u, "
"big_lwe_dimension: %u, "
"small_lwe_dimension: %u, ks_level: %u, ks_base_log: %u, pbs_level: "
"%u, pbs_base_log: "
"%u, grouping_factor: %u, message_modulus: %u, carry_modulus: %u\n",
pbs_type, glwe_dimension, polynomial_size, big_lwe_dimension,
small_lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus);
};
};
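// Note: for 2_2 parameters, message_modulus = 4 and carry_modulus = 4, i.e. 2
// message bits and 2 carry bits per block, and big_lwe_dimension corresponds
// to glwe_dimension * polynomial_size (see the noise squashing constructor
// below, which derives the input big LWE dimension that way).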
// Store things needed to apply LUTs
template <typename InputTorus, typename OutputTorus>
struct int_radix_lut_custom_input_output {
int_radix_params params;
// The number of blocks to be processed by the LUT. It can be smaller
// than num_input_blocks because some LUT types (like noise squashing)
// pack the input blocks into fewer blocks.
uint32_t num_blocks = 0;
// The number of blocks of the input ciphertext. For noise
// squashing these blocks are packed into num_blocks
uint32_t num_input_blocks = 0;
// Number of LUTs to store in this structure
uint32_t num_luts = 0;
// ManyLUT is the mechanism to apply several LUTs in a single PBS
uint32_t num_many_lut = 1;
// The LWE dimension of the KS output / PBS input. Initialized
// to the max value so that we crash if this value is set incorrectly
// by the caller
uint32_t input_big_lwe_dimension = (uint32_t)-1;
// Tracks the degree of each LUT and the max degree on the CPU.
// The max degree is (message_modulus * carry_modulus - 1), except for
// many-LUT where it differs.
uint64_t *degrees = nullptr;
uint64_t *max_degrees = nullptr;
CudaStreams active_streams;
bool mem_reuse = false;
// There will be one buffer on each GPU in multi-GPU computations
// (same for tmp lwe arrays)
std::vector<int8_t *> buffer;
// These arrays will reside on all GPUs
// lut could actually be allocated & initialized GPU per GPU but this is not
// done at the moment
std::vector<OutputTorus *> lut_vec;
std::vector<InputTorus *> lut_indexes_vec;
InputTorus *h_lut_indexes = nullptr;
// All tmp lwe arrays and index arrays for lwe contain the total
// amount of blocks to be computed on, there is no split between GPUs
// for the moment
InputTorus *lwe_indexes_in = nullptr;
InputTorus *lwe_indexes_out = nullptr;
InputTorus *h_lwe_indexes_in = nullptr;
InputTorus *h_lwe_indexes_out = nullptr;
// Enable optimizations if lwe_indexes_(in/out) are trivial
bool using_trivial_lwe_indexes = true;
// lwe_trivial_indexes is the intermediary index we need in case
// lwe_indexes_in != lwe_indexes_out
InputTorus *lwe_trivial_indexes = nullptr;
// buffer to store packed message bits of a radix ciphertext
CudaRadixCiphertextFFI *tmp_lwe_before_ks = nullptr;
/// For multi GPU execution we create vectors of pointers for inputs and
/// outputs
std::vector<InputTorus *> lwe_array_in_vec;
std::vector<InputTorus *> lwe_after_ks_vec;
std::vector<OutputTorus *> lwe_after_pbs_vec;
std::vector<InputTorus *> lwe_trivial_indexes_vec;
std::vector<ks_mem<InputTorus> *>
ks_tmp_buf_vec; // buffers on each GPU to store keyswitch temporary data
std::vector<InputTorus *> lwe_aligned_vec;
bool gpu_memory_allocated;
CudaStreamsBarrier multi_gpu_scatter_barrier, multi_gpu_broadcast_barrier;
CudaStreamsBarrier multi_gpu_gather_barrier;
// Setup the LUT configuration:
// input_big_lwe_dimension: BIG LWE dimension of the KS output / PBS input
// params: cryptographic parameters of the PBS output
// num_luts: number of LUTs (or many-LUT sets) in this structure
// num_many_lut: number of LUTs to apply in a single PBS pass
// num_radix_blocks: number of blocks in the radix integer
void setup_config_and_degrees(CudaStreams streams,
uint32_t input_big_lwe_dimension,
int_radix_params params, uint32_t num_luts,
uint32_t num_many_lut,
uint32_t num_radix_blocks,
uint32_t num_input_blocks,
bool allocate_gpu_memory) {
this->params = params;
this->num_blocks = num_radix_blocks;
this->num_luts = num_luts;
this->num_many_lut = num_many_lut;
this->input_big_lwe_dimension = input_big_lwe_dimension;
this->num_input_blocks = num_input_blocks;
this->gpu_memory_allocated = allocate_gpu_memory;
this->active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
}
void setup_degrees() {
this->degrees =
(uint64_t *)malloc(num_many_lut * num_luts * sizeof(uint64_t));
this->max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
}
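// Note on layout: degrees stores num_many_lut entries per LUT, so the degrees
// of LUT i start at index num_many_lut * i (see get_degree below), while
// max_degrees stores a single entry per LUT.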
void allocate_pbs_buffers(int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t &size_tracker) {
int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
for (uint i = 0; i < active_streams.count(); i++) {
cuda_set_device(active_streams.gpu_index(i));
int8_t *gpu_pbs_buffer;
auto num_blocks_on_gpu = std::min(
(int)num_radix_blocks,
std::max(threshold, get_num_inputs_on_gpu(num_radix_blocks, i,
active_streams.count())));
uint64_t size = 0;
execute_scratch_pbs<OutputTorus>(
active_streams.stream(i), active_streams.gpu_index(i),
&gpu_pbs_buffer, params.glwe_dimension, params.small_lwe_dimension,
params.polynomial_size, params.pbs_level, params.grouping_factor,
num_blocks_on_gpu, params.pbs_type, allocate_gpu_memory,
params.noise_reduction_type, size);
if (i == 0) {
size_tracker += size;
}
buffer.push_back(gpu_pbs_buffer);
}
// This buffer is created with num_input_blocks since it
// stores the ciphertext before KS or packing.
tmp_lwe_before_ks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<InputTorus>(
active_streams.stream(0), active_streams.gpu_index(0),
tmp_lwe_before_ks, num_input_blocks, input_big_lwe_dimension,
size_tracker, allocate_gpu_memory);
}
void alloc_and_init_multi_gpu_buffers(int_radix_params params,
uint32_t num_radix_blocks,
bool allocate_gpu_memory,
uint64_t &size_tracker) {
GPU_ASSERT(lwe_array_in_vec.empty(), "Multi GPU buffers already allocated");
/// With multiple GPUs we allocate arrays to be pushed to the vectors and
/// copy data on each GPU then when we gather data to GPU 0 we can copy
/// back to the original indexing
multi_gpu_alloc_lwe_async(active_streams, lwe_array_in_vec,
num_radix_blocks, params.big_lwe_dimension + 1,
size_tracker, params.pbs_type,
allocate_gpu_memory);
multi_gpu_alloc_lwe_async(active_streams, lwe_after_ks_vec,
num_radix_blocks, params.small_lwe_dimension + 1,
size_tracker, params.pbs_type,
allocate_gpu_memory);
if (num_many_lut > 1) {
multi_gpu_alloc_lwe_many_lut_output_async(
active_streams, lwe_after_pbs_vec, num_radix_blocks, num_many_lut,
params.big_lwe_dimension + 1, size_tracker, params.pbs_type,
allocate_gpu_memory);
} else {
multi_gpu_alloc_lwe_async(active_streams, lwe_after_pbs_vec,
num_radix_blocks, params.big_lwe_dimension + 1,
size_tracker, params.pbs_type,
allocate_gpu_memory);
}
multi_gpu_alloc_array_async(active_streams, lwe_trivial_indexes_vec,
num_radix_blocks, size_tracker,
allocate_gpu_memory);
cuda_synchronize_stream(active_streams.stream(0),
active_streams.gpu_index(0));
// This call will not copy if allocate_gpu_memory is false
// thus it's safe to call it on a null source pointer
multi_gpu_copy_array_async(active_streams, lwe_trivial_indexes_vec,
lwe_trivial_indexes, num_radix_blocks,
allocate_gpu_memory);
}
void setup_gemm_batch_ks_temp_buffers(uint64_t &size_tracker) {
int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
auto inputs_on_gpu = std::min(
(int)num_input_blocks,
std::max(threshold, get_num_inputs_on_gpu(num_input_blocks, 0,
active_streams.count())));
if (inputs_on_gpu >= get_threshold_ks_gemm()) {
for (auto i = 0; i < active_streams.count(); ++i) {
ks_mem<InputTorus> *ks_buffer;
uint64_t sub_size_tracker = scratch_cuda_keyswitch<InputTorus>(
active_streams.stream(i), active_streams.gpu_index(i), &ks_buffer,
input_big_lwe_dimension, params.small_lwe_dimension, num_blocks,
gpu_memory_allocated);
if (i == 0) {
size_tracker += sub_size_tracker;
}
ks_tmp_buf_vec.push_back(ks_buffer);
}
}
}
void setup_mem_reuse(uint32_t num_radix_blocks,
int_radix_lut_custom_input_output *base_lut_object) {
// the base LUT object must hold at least as much memory as the current one
if (num_radix_blocks > base_lut_object->num_blocks)
PANIC("Cuda error: lut does not have enough blocks")
// pbs
buffer = base_lut_object->buffer;
// Keyswitch
tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks;
/// With multiple GPUs we allocate arrays to be pushed to the vectors and
/// copy data on each GPU then when we gather data to GPU 0 we can copy back
/// to the original indexing
lwe_array_in_vec = base_lut_object->lwe_array_in_vec;
lwe_after_ks_vec = base_lut_object->lwe_after_ks_vec;
lwe_after_pbs_vec = base_lut_object->lwe_after_pbs_vec;
lwe_trivial_indexes_vec = base_lut_object->lwe_trivial_indexes_vec;
ks_tmp_buf_vec = base_lut_object->ks_tmp_buf_vec;
mem_reuse = true;
}
void setup_lwe_trivial_indices(uint32_t num_radix_blocks,
bool allocate_gpu_memory,
uint64_t &size_tracker) {
// lwe_(input/output)_indexes are initialized to range(num_radix_blocks)
// by default
lwe_indexes_in = (InputTorus *)cuda_malloc_with_size_tracking_async(
num_radix_blocks * sizeof(InputTorus), active_streams.stream(0),
active_streams.gpu_index(0), size_tracker, allocate_gpu_memory);
lwe_indexes_out = (InputTorus *)cuda_malloc_with_size_tracking_async(
num_radix_blocks * sizeof(InputTorus), active_streams.stream(0),
active_streams.gpu_index(0), size_tracker, allocate_gpu_memory);
lwe_trivial_indexes = (InputTorus *)cuda_malloc_with_size_tracking_async(
num_radix_blocks * sizeof(InputTorus), active_streams.stream(0),
active_streams.gpu_index(0), size_tracker, allocate_gpu_memory);
h_lwe_indexes_in =
(InputTorus *)malloc(num_radix_blocks * sizeof(InputTorus));
h_lwe_indexes_out =
(InputTorus *)malloc(num_radix_blocks * sizeof(InputTorus));
for (int i = 0; i < num_radix_blocks; i++)
h_lwe_indexes_in[i] = i;
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(InputTorus),
active_streams.stream(0), active_streams.gpu_index(0),
allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(InputTorus), active_streams.stream(0),
active_streams.gpu_index(0), allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_trivial_indexes, h_lwe_indexes_in,
num_radix_blocks * sizeof(InputTorus), active_streams.stream(0),
active_streams.gpu_index(0), allocate_gpu_memory);
memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
num_radix_blocks * sizeof(InputTorus));
h_lut_indexes =
(InputTorus *)(calloc(num_radix_blocks, sizeof(InputTorus)));
}
void setup_multi_gpu(int_radix_params params, uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t &size_tracker) {
if (!mem_reuse)
alloc_and_init_multi_gpu_buffers(params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
if (active_streams.count() > 1) {
multi_gpu_gather_barrier.create_on(active_streams);
multi_gpu_broadcast_barrier.create_on(active_streams);
multi_gpu_scatter_barrier.create_on(active_streams);
}
}
int_radix_lut_custom_input_output(CudaStreams streams,
int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks,
bool allocate_gpu_memory,
uint64_t &size_tracker) {
setup_config_and_degrees(streams, params.big_lwe_dimension, params,
num_luts, 1, num_radix_blocks, num_radix_blocks,
allocate_gpu_memory);
setup_degrees();
allocate_pbs_buffers(params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
allocate_luts_and_indexes(num_radix_blocks, size_tracker);
setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory,
size_tracker);
setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
}
// Constructor for noise squashing LUT which packs the input
// ciphertext of num_input_blocks blocks into a new one with fewer blocks,
// num_radix_blocks
int_radix_lut_custom_input_output(CudaStreams streams,
uint32_t input_big_lwe_dimension,
int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks,
uint32_t num_input_blocks,
bool allocate_gpu_memory,
uint64_t &size_tracker) {
setup_config_and_degrees(streams, input_big_lwe_dimension, params, num_luts,
1, num_radix_blocks, num_input_blocks,
allocate_gpu_memory);
setup_degrees();
allocate_pbs_buffers(params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
allocate_luts_and_indexes(num_radix_blocks, size_tracker);
setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory,
size_tracker);
setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
}
// constructor to reuse memory
int_radix_lut_custom_input_output(
CudaStreams streams, int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks,
int_radix_lut_custom_input_output *base_lut_object,
bool allocate_gpu_memory, uint64_t &size_tracker) {
setup_config_and_degrees(streams, params.big_lwe_dimension, params,
num_luts, 1, num_radix_blocks, num_radix_blocks,
allocate_gpu_memory);
setup_degrees();
setup_mem_reuse(num_radix_blocks, base_lut_object);
allocate_luts_and_indexes(num_radix_blocks, size_tracker);
setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory,
size_tracker);
setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
}
// Constructor for many-LUT
int_radix_lut_custom_input_output(CudaStreams streams,
int_radix_params params, uint32_t num_luts,
uint32_t num_radix_blocks,
uint32_t num_many_lut,
bool allocate_gpu_memory,
uint64_t &size_tracker) {
setup_config_and_degrees(streams, params.big_lwe_dimension, params,
num_luts, num_many_lut, num_radix_blocks,
num_radix_blocks, allocate_gpu_memory);
setup_degrees();
allocate_pbs_buffers(params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
allocate_luts_and_indexes(num_radix_blocks, size_tracker);
setup_lwe_trivial_indices(num_radix_blocks, allocate_gpu_memory,
size_tracker);
setup_multi_gpu(params, num_radix_blocks, allocate_gpu_memory,
size_tracker);
}
// Return a pointer to the idx-th LUT in gpu_index's global memory
OutputTorus *get_lut(uint32_t gpu_index, size_t idx) {
if (!gpu_memory_allocated)
return nullptr;
auto lut = lut_vec[gpu_index];
size_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
if (lut == nullptr)
PANIC("Cuda error: invalid lut pointer")
return &lut[idx * lut_size];
}
// Return a pointer to the idx-th degree
uint64_t *get_degree(size_t idx) {
GPU_ASSERT(idx < num_luts, "Invalid degree requested");
return &degrees[num_many_lut * idx];
}
// Return a pointer to the idx-th max degree
uint64_t *get_max_degree(size_t idx) {
GPU_ASSERT(idx < num_luts, "Invalid degree requested");
return &max_degrees[idx];
}
// Return a pointer to the ind-th LUT index in gpu_index's global memory
InputTorus *get_lut_indexes(uint32_t gpu_index, size_t ind) {
if (!gpu_memory_allocated)
return nullptr;
auto lut_indexes = lut_indexes_vec[gpu_index];
return &lut_indexes[ind];
}
// Allocate the LUT buffers and their index arrays.
// The LUT content is a trivial GLWE encryption and must be initialized
// outside of this function.
void allocate_luts_and_indexes(uint32_t num_radix_blocks,
uint64_t &size_tracker) {
uint64_t lut_indexes_size = num_radix_blocks * sizeof(InputTorus);
uint64_t lut_buffer_size = (params.glwe_dimension + 1) *
params.polynomial_size * sizeof(OutputTorus);
for (uint i = 0; i < active_streams.count(); i++) {
auto lut = (OutputTorus *)cuda_malloc_with_size_tracking_async(
num_luts * lut_buffer_size, active_streams.stream(i),
active_streams.gpu_index(i), size_tracker, gpu_memory_allocated);
auto lut_indexes = (InputTorus *)cuda_malloc_with_size_tracking_async(
lut_indexes_size, active_streams.stream(i),
active_streams.gpu_index(i), size_tracker, gpu_memory_allocated);
// lut_indexes is initialized to 0 by default
// if a different behavior is wanted, it should be rewritten later
cuda_memset_with_size_tracking_async(
lut_indexes, 0, lut_indexes_size, active_streams.stream(i),
active_streams.gpu_index(i), gpu_memory_allocated);
lut_vec.push_back(lut);
lut_indexes_vec.push_back(lut_indexes);
}
}
// If this function is called we assume lwe_indexes_(in/out) are no longer
// trivial, so the related optimizations are disabled
void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index,
InputTorus *h_indexes_in, InputTorus *h_indexes_out) {
memcpy(h_lwe_indexes_in, h_indexes_in, num_blocks * sizeof(InputTorus));
memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(InputTorus));
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_indexes_in, h_lwe_indexes_in, num_blocks * sizeof(InputTorus),
stream, gpu_index, gpu_memory_allocated);
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_indexes_out, h_lwe_indexes_out, num_blocks * sizeof(InputTorus),
stream, gpu_index, gpu_memory_allocated);
using_trivial_lwe_indexes = false;
}
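// Illustrative usage (int_bit_extract_luts_buffer below shows a real call
// site): the caller fills two host arrays of num_blocks entries, e.g. to fan
// each input block out to several PBS outputs, and passes them here; the
// device-side index arrays are then kept in sync with the host copies.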
// Broadcast the LUTs from GPU gpu_indexes[0] to all active GPUs
void broadcast_lut(CudaStreams new_active_streams,
bool broadcast_lut_values = true) {
PANIC_IF_FALSE(new_active_streams.gpu_index(0) ==
active_streams.gpu_index(0),
"Broadcasting LUTs can only be done using the same GPUs "
" originally assigned to the int_radix_lut");
// We only broadcast if there is more than one active GPU
if (new_active_streams.count() == 1)
return;
GPU_ASSERT(active_streams.count() >= new_active_streams.count(),
"To broadcast a LUT to a GPU set, it must have been initialized "
"with a GPU set that is greater or equal in size");
int active_device = cuda_get_device();
uint64_t lut_size = (params.glwe_dimension + 1) * params.polynomial_size;
// Wait for GPU 0 to receive all data from previous computations
// that may have occurred on different GPUs
multi_gpu_broadcast_barrier.local_streams_wait_for_stream_0(
new_active_streams);
// The LUT and its indexes reside on GPU 0
// these were filled by calls to generate_device_accumulator
// due to the previous synchronization, we're sure these buffers have
// finished copying to GPU 0 from CPU
auto src_lut = lut_vec[0];
auto src_lut_indexes = lut_indexes_vec[0];
for (uint i = 1; i < new_active_streams.count(); i++) {
PANIC_IF_FALSE(
new_active_streams.gpu_index(i) == active_streams.gpu_index(i),
"Broadcasting LUTs can only be done to the LUT streams or to new "
"streams that reside on the same GPUs as the source LUTs");
// Check for redundant copies
#ifndef DEBUG_FAKE_MULTI_GPU
PANIC_IF_FALSE(new_active_streams.gpu_index(i) !=
new_active_streams.gpu_index(0),
"Broadcast LUT does not handle duplicate GPUs in the "
"active streams set");
#endif
if (broadcast_lut_values) {
auto dst_lut = lut_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut, src_lut, num_luts * lut_size * sizeof(OutputTorus),
new_active_streams.stream(i), new_active_streams.gpu_index(i),
gpu_memory_allocated);
}
auto dst_lut_indexes = lut_indexes_vec[i];
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
dst_lut_indexes, src_lut_indexes, num_blocks * sizeof(InputTorus),
new_active_streams.stream(i), new_active_streams.gpu_index(i),
gpu_memory_allocated);
}
// Ensure the device set at the end of this method is the same as it was
// set at the beginning
cuda_set_device(active_device);
}
void allocate_lwe_vector_for_non_trivial_indexes(
CudaStreams streams, uint64_t max_num_radix_blocks,
uint64_t &size_tracker, bool allocate_gpu_memory) {
int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
: THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
// We need to create the auxiliary array only in GPU 0
if (active_streams.count() > 1) {
lwe_aligned_vec.resize(active_streams.count());
for (uint i = 0; i < active_streams.count(); i++) {
uint64_t size_tracker_on_array_i = 0;
auto inputs_on_gpu = std::min(
(int)max_num_radix_blocks,
std::max(threshold, get_num_inputs_on_gpu(max_num_radix_blocks, i,
active_streams.count())));
InputTorus *d_array =
(InputTorus *)cuda_malloc_with_size_tracking_async(
inputs_on_gpu * (params.big_lwe_dimension + 1) *
sizeof(InputTorus),
streams.stream(0), streams.gpu_index(0),
size_tracker_on_array_i, allocate_gpu_memory);
lwe_aligned_vec[i] = d_array;
size_tracker += size_tracker_on_array_i;
}
}
}
void release(CudaStreams streams) {
PANIC_IF_FALSE(lut_indexes_vec.size() == lut_vec.size(),
"Lut vec and Lut vec indexes must have the same size");
for (uint i = 0; i < lut_vec.size(); i++) {
cuda_drop_with_size_tracking_async(lut_vec[i], active_streams.stream(i),
active_streams.gpu_index(i),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(
lut_indexes_vec[i], active_streams.stream(i),
active_streams.gpu_index(i), gpu_memory_allocated);
}
cuda_drop_with_size_tracking_async(lwe_indexes_in, active_streams.stream(0),
active_streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(
lwe_indexes_out, active_streams.stream(0), active_streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(
lwe_trivial_indexes, active_streams.stream(0),
active_streams.gpu_index(0), gpu_memory_allocated);
cuda_synchronize_stream(active_streams.stream(0),
active_streams.gpu_index(0));
lut_vec.clear();
lut_indexes_vec.clear();
free(h_lwe_indexes_in);
free(h_lwe_indexes_out);
if (active_streams.count() > 1) {
active_streams.synchronize();
multi_gpu_gather_barrier.release();
multi_gpu_broadcast_barrier.release();
multi_gpu_scatter_barrier.release();
}
if (!mem_reuse) {
release_radix_ciphertext_async(active_streams.stream(0),
active_streams.gpu_index(0),
tmp_lwe_before_ks, gpu_memory_allocated);
for (int i = 0; i < buffer.size(); i++) {
switch (params.pbs_type) {
case MULTI_BIT:
cleanup_cuda_multi_bit_programmable_bootstrap(
active_streams.stream(i), active_streams.gpu_index(i),
&buffer[i]);
break;
case CLASSICAL:
cleanup_cuda_programmable_bootstrap(active_streams.stream(i),
active_streams.gpu_index(i),
&buffer[i]);
break;
default:
PANIC("Cuda error (PBS): unknown PBS type. ")
}
cuda_synchronize_stream(active_streams.stream(i),
active_streams.gpu_index(i));
}
delete tmp_lwe_before_ks;
buffer.clear();
if (gpu_memory_allocated) {
multi_gpu_release_async(active_streams, lwe_array_in_vec);
multi_gpu_release_async(active_streams, lwe_after_ks_vec);
multi_gpu_release_async(active_streams, lwe_after_pbs_vec);
multi_gpu_release_async(active_streams, lwe_trivial_indexes_vec);
}
lwe_array_in_vec.clear();
lwe_after_ks_vec.clear();
lwe_after_pbs_vec.clear();
lwe_trivial_indexes_vec.clear();
if (lwe_aligned_vec.size() > 0) {
for (uint i = 0; i < active_streams.count(); i++) {
cuda_drop_with_size_tracking_async(
lwe_aligned_vec[i], active_streams.stream(0),
active_streams.gpu_index(0), gpu_memory_allocated);
}
lwe_aligned_vec.clear();
}
for (auto i = 0; i < ks_tmp_buf_vec.size(); i++) {
cleanup_cuda_keyswitch(active_streams.stream(i),
active_streams.gpu_index(i), ks_tmp_buf_vec[i],
gpu_memory_allocated);
}
ks_tmp_buf_vec.clear();
}
free(h_lut_indexes);
free(degrees);
free(max_degrees);
}
};
template <typename Torus, typename OutputTorus = Torus>
using int_radix_lut = int_radix_lut_custom_input_output<Torus, OutputTorus>;
template <typename InputTorus>
struct int_noise_squashing_lut
: int_radix_lut_custom_input_output<InputTorus, __uint128_t> {
std::vector<InputTorus *> lwe_aligned_scatter_vec;
std::vector<__uint128_t *> lwe_aligned_gather_vec;
// noise squashing constructor
int_noise_squashing_lut(CudaStreams streams, int_radix_params params,
uint32_t input_glwe_dimension,
uint32_t input_polynomial_size,
uint32_t num_radix_blocks,
uint32_t original_num_blocks,
bool allocate_gpu_memory, uint64_t &size_tracker)
: int_radix_lut_custom_input_output<InputTorus, __uint128_t>(
streams, input_glwe_dimension * input_polynomial_size, params, 1,
num_radix_blocks, original_num_blocks, allocate_gpu_memory,
size_tracker) {
// lut for the squashing
auto f_squash = [](__uint128_t block) -> __uint128_t { return block; };
generate_device_accumulator<__uint128_t>(
this->active_streams.stream(0), this->active_streams.gpu_index(0),
this->get_lut(0, 0), this->get_degree(0), this->get_max_degree(0),
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, f_squash, allocate_gpu_memory);
this->broadcast_lut(this->active_streams);
}
using int_radix_lut_custom_input_output<InputTorus, __uint128_t>::release;
};
// Forward declarations for operation buffers
template <typename Torus> struct int_sub_and_propagate;
template <typename Torus> struct int_bit_extract_luts_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
bool gpu_memory_allocated;
// With offset
int_bit_extract_luts_buffer(CudaStreams streams, int_radix_params params,
uint32_t bits_per_block, uint32_t final_offset,
uint32_t num_radix_blocks,
bool allocate_gpu_memory,
uint64_t &size_tracker) {
this->params = params;
gpu_memory_allocated = allocate_gpu_memory;
lut = new int_radix_lut<Torus>(streams, params, bits_per_block,
bits_per_block * num_radix_blocks,
allocate_gpu_memory, size_tracker);
for (int i = 0; i < bits_per_block; i++) {
auto operator_f = [i, final_offset](Torus x) -> Torus {
Torus y = (x >> i) & 1;
return y << final_offset;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut->get_lut(0, i),
lut->get_degree(i), lut->get_max_degree(i), params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
operator_f, gpu_memory_allocated);
}
/**
 * we have bits_per_block LUTs that should be used for all bits in all
 * blocks
 */
Torus *h_lut_indexes = lut->h_lut_indexes;
for (int j = 0; j < num_radix_blocks; j++) {
for (int i = 0; i < bits_per_block; i++)
h_lut_indexes[i + j * bits_per_block] = i;
}
cuda_memcpy_with_size_tracking_async_to_gpu(
lut->get_lut_indexes(0, 0), h_lut_indexes,
num_radix_blocks * bits_per_block * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
auto active_streams = streams.active_gpu_subset(
bits_per_block * num_radix_blocks, params.pbs_type);
lut->broadcast_lut(active_streams);
/**
 * the input indexes make the first bits_per_block PBSes target block 0,
 * the next bits_per_block PBSes target block 1, etc.
 */
Torus *h_lwe_indexes_in =
(Torus *)malloc(num_radix_blocks * bits_per_block * sizeof(Torus));
for (int j = 0; j < num_radix_blocks; j++) {
for (int i = 0; i < bits_per_block; i++)
h_lwe_indexes_in[i + j * bits_per_block] = j;
}
/**
 * each output should target a different LWE ciphertext, so
 * lwe_indexes_out = range(num_radix_blocks * bits_per_block)
 */
Torus *h_lwe_indexes_out =
(Torus *)malloc(num_radix_blocks * bits_per_block * sizeof(Torus));
for (int i = 0; i < num_radix_blocks * bits_per_block; i++)
h_lwe_indexes_out[i] = i;
lut->set_lwe_indexes(streams.stream(0), streams.gpu_index(0),
h_lwe_indexes_in, h_lwe_indexes_out);
lut->allocate_lwe_vector_for_non_trivial_indexes(
active_streams, num_radix_blocks * bits_per_block, size_tracker,
allocate_gpu_memory);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(h_lwe_indexes_in);
free(h_lwe_indexes_out);
}
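// Worked example: with bits_per_block = 2 and num_radix_blocks = 2, the four
// PBS use lut_indexes = {0, 1, 0, 1}, lwe_indexes_in = {0, 0, 1, 1} (each
// input block feeds one PBS per extracted bit) and
// lwe_indexes_out = {0, 1, 2, 3}.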
// Without offset
int_bit_extract_luts_buffer(CudaStreams streams, int_radix_params params,
uint32_t bits_per_block,
uint32_t num_radix_blocks,
bool allocate_gpu_memory, uint64_t &size_tracker)
: int_bit_extract_luts_buffer(streams, params, bits_per_block, 0,
num_radix_blocks, allocate_gpu_memory,
size_tracker) {}
void release(CudaStreams streams) {
lut->release(streams);
delete (lut);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
};
template <typename Torus> struct int_fullprop_buffer {
int_radix_params params;
int_radix_lut<Torus> *lut;
CudaRadixCiphertextFFI *tmp_small_lwe_vector;
CudaRadixCiphertextFFI *tmp_big_lwe_vector;
bool gpu_memory_allocated;
int_fullprop_buffer(CudaStreams streams, int_radix_params params,
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
gpu_memory_allocated = allocate_gpu_memory;
lut = new int_radix_lut<Torus>(streams.get_ith(0), params, 2, 2,
allocate_gpu_memory, size_tracker);
// LUTs
auto lut_f_message = [params](Torus x) -> Torus {
return x % params.message_modulus;
};
auto lut_f_carry = [params](Torus x) -> Torus {
return x / params.message_modulus;
};
//
Torus *lut_buffer_message = lut->get_lut(0, 0);
uint64_t *message_degree = lut->get_degree(0);
uint64_t *message_max_degree = lut->get_max_degree(0);
Torus *lut_buffer_carry = lut->get_lut(0, 1);
uint64_t *carry_degree = lut->get_degree(1);
uint64_t *carry_max_degree = lut->get_max_degree(1);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut_buffer_message,
message_degree, message_max_degree, params.glwe_dimension,
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_message, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut_buffer_carry, carry_degree,
carry_max_degree, params.glwe_dimension, params.polynomial_size,
params.message_modulus, params.carry_modulus, lut_f_carry,
gpu_memory_allocated);
uint64_t lwe_indexes_size = 2 * sizeof(Torus);
Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
for (int i = 0; i < 2; i++)
h_lwe_indexes[i] = i;
Torus *lwe_indexes = lut->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_to_gpu(
lwe_indexes, h_lwe_indexes, lwe_indexes_size, streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
//
// No broadcast is needed because full propagation runs on a single GPU.
//
tmp_small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), tmp_small_lwe_vector, 2,
params.small_lwe_dimension, size_tracker, allocate_gpu_memory);
tmp_big_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), tmp_big_lwe_vector, 2,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
free(h_lwe_indexes);
}
void release(CudaStreams streams) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
tmp_small_lwe_vector, gpu_memory_allocated);
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
tmp_big_lwe_vector, gpu_memory_allocated);
lut->release(streams.get_ith(0));
delete tmp_small_lwe_vector;
delete tmp_big_lwe_vector;
delete lut;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
};
template <typename Torus> struct int_sum_ciphertexts_vec_memory {
int_radix_params params;
uint32_t max_total_blocks_in_vec;
uint32_t num_blocks_in_radix;
uint32_t max_num_radix_in_vec;
uint32_t chunk_size;
bool gpu_memory_allocated;
bool reduce_degrees_for_single_carry_propagation;
// temporary buffers
CudaRadixCiphertextFFI *current_blocks;
CudaRadixCiphertextFFI *small_lwe_vector;
uint32_t *d_columns_data;
uint32_t *d_columns_counter;
uint32_t **d_columns;
uint32_t *d_new_columns_data;
uint32_t *d_new_columns_counter;
uint32_t **d_new_columns;
uint64_t *d_degrees;
// lookup table for extracting message and carry
int_radix_lut<Torus> *luts_message_carry;
bool mem_reuse = false;
bool allocated_luts_message_carry;
void setup_index_buffers(CudaStreams streams, uint64_t &size_tracker) {
d_degrees = (uint64_t *)cuda_malloc_with_size_tracking_async(
max_total_blocks_in_vec * sizeof(uint64_t), streams.stream(0),
streams.gpu_index(0), size_tracker, gpu_memory_allocated);
auto num_blocks_in_radix = this->num_blocks_in_radix;
auto max_num_radix_in_vec = this->max_num_radix_in_vec;
auto setup_columns = [num_blocks_in_radix, max_num_radix_in_vec, streams](
uint32_t **&columns, uint32_t *&columns_data,
uint32_t *&columns_counter, uint64_t &size_tracker,
bool gpu_memory_allocated) {
columns_data = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * max_num_radix_in_vec * sizeof(uint32_t),
streams.stream(0), streams.gpu_index(0), size_tracker,
gpu_memory_allocated);
columns_counter = (uint32_t *)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t), streams.stream(0),
streams.gpu_index(0), size_tracker, gpu_memory_allocated);
cuda_memset_with_size_tracking_async(
columns_counter, 0, num_blocks_in_radix * sizeof(uint32_t),
streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
uint32_t **h_columns = new uint32_t *[num_blocks_in_radix];
for (int i = 0; i < num_blocks_in_radix; ++i) {
h_columns[i] = columns_data + i * max_num_radix_in_vec;
}
columns = (uint32_t **)cuda_malloc_with_size_tracking_async(
num_blocks_in_radix * sizeof(uint32_t *), streams.stream(0),
streams.gpu_index(0), size_tracker, gpu_memory_allocated);
if (gpu_memory_allocated) {
cuda_memcpy_async_to_gpu(columns, h_columns,
num_blocks_in_radix * sizeof(uint32_t *),
streams.stream(0), streams.gpu_index(0));
}
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
delete[] h_columns;
};
setup_columns(d_columns, d_columns_data, d_columns_counter, size_tracker,
gpu_memory_allocated);
setup_columns(d_new_columns, d_new_columns_data, d_new_columns_counter,
size_tracker, gpu_memory_allocated);
}
void setup_lookup_tables(CudaStreams streams, uint32_t num_radix_in_vec,
const uint64_t *const degrees) {
uint32_t message_modulus = params.message_modulus;
bool _needs_processing = false;
radix_columns current_columns(degrees, num_blocks_in_radix,
num_radix_in_vec, chunk_size,
_needs_processing);
uint32_t total_ciphertexts = 0;
uint32_t total_messages = 0;
current_columns.next_accumulation(total_ciphertexts, total_messages,
_needs_processing);
uint32_t pbs_count = std::max(total_ciphertexts, 2 * num_blocks_in_radix);
if (!mem_reuse) {
if (total_ciphertexts > 0 ||
reduce_degrees_for_single_carry_propagation) {
uint64_t size_tracker = 0;
luts_message_carry = new int_radix_lut<Torus>(
streams, params, 2, pbs_count, true, size_tracker);
allocated_luts_message_carry = true;
uint64_t message_modulus_bits =
(uint64_t)std::log2(params.message_modulus);
uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
uint64_t total_bits_per_block =
message_modulus_bits + carry_modulus_bits;
uint64_t denominator =
(uint64_t)std::ceil((pow(2, total_bits_per_block) - 1) /
(pow(2, message_modulus_bits) - 1));
uint64_t upper_bound_num_blocks =
max_total_blocks_in_vec * 2 / denominator;
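// e.g. for 2_2 parameters: 2 message bits + 2 carry bits per block give
// denominator = ceil((2^4 - 1) / (2^2 - 1)) = 5, so the upper bound is
// max_total_blocks_in_vec * 2 / 5 blocks.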
luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
streams, upper_bound_num_blocks, size_tracker, true);
}
}
if (allocated_luts_message_carry) {
auto message_acc = luts_message_carry->get_lut(0, 0);
auto carry_acc = luts_message_carry->get_lut(0, 1);
// define functions for each accumulator
auto lut_f_message = [message_modulus](Torus x) -> Torus {
return x % message_modulus;
};
auto lut_f_carry = [message_modulus](Torus x) -> Torus {
return x / message_modulus;
};
// generate accumulators
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), message_acc,
luts_message_carry->get_degree(0),
luts_message_carry->get_max_degree(0), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_message, gpu_memory_allocated);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), carry_acc,
luts_message_carry->get_degree(1),
luts_message_carry->get_max_degree(1), params.glwe_dimension,
params.polynomial_size, message_modulus, params.carry_modulus,
lut_f_carry, gpu_memory_allocated);
auto active_gpu_count_mc =
streams.active_gpu_subset(pbs_count, params.pbs_type);
luts_message_carry->broadcast_lut(active_gpu_count_mc);
}
}
int_sum_ciphertexts_vec_memory(
CudaStreams streams, int_radix_params params,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
bool reduce_degrees_for_single_carry_propagation,
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->params = params;
this->mem_reuse = false;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
this->num_blocks_in_radix = num_blocks_in_radix;
this->max_num_radix_in_vec = max_num_radix_in_vec;
this->gpu_memory_allocated = allocate_gpu_memory;
this->chunk_size = (params.message_modulus * params.carry_modulus - 1) /
(params.message_modulus - 1);
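// (for 2_2 parameters this gives chunk_size = (4 * 4 - 1) / (4 - 1) = 5)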
this->allocated_luts_message_carry = false;
this->reduce_degrees_for_single_carry_propagation =
reduce_degrees_for_single_carry_propagation;
setup_index_buffers(streams, size_tracker);
// Because the LUT is set up in the sum_ciphertexts host function to save
// memory, the size_tracker is topped up here to give an upper bound on the
// memory that will be used.
uint32_t max_pbs_count = std::max(
2 * (max_total_blocks_in_vec / chunk_size), 2 * num_blocks_in_radix);
if (max_pbs_count > 0) {
int_radix_lut<Torus> *luts_message_carry_dry_run =
new int_radix_lut<Torus>(streams, params, 2, max_pbs_count, false,
size_tracker);
luts_message_carry_dry_run->release(streams);
delete luts_message_carry_dry_run;
}
// create and allocate intermediate buffers
current_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), current_blocks,
max_total_blocks_in_vec, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
small_lwe_vector = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), small_lwe_vector,
max_total_blocks_in_vec, params.small_lwe_dimension, size_tracker,
allocate_gpu_memory);
}
int_sum_ciphertexts_vec_memory(
CudaStreams streams, int_radix_params params,
uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
CudaRadixCiphertextFFI *current_blocks,
CudaRadixCiphertextFFI *small_lwe_vector,
int_radix_lut<Torus> *reused_lut,
bool reduce_degrees_for_single_carry_propagation,
bool allocate_gpu_memory, uint64_t &size_tracker) {
this->mem_reuse = true;
this->params = params;
this->max_total_blocks_in_vec = num_blocks_in_radix * max_num_radix_in_vec;
this->num_blocks_in_radix = num_blocks_in_radix;
this->max_num_radix_in_vec = max_num_radix_in_vec;
this->gpu_memory_allocated = allocate_gpu_memory;
this->chunk_size = (params.message_modulus * params.carry_modulus - 1) /
(params.message_modulus - 1);
this->allocated_luts_message_carry = true;
this->reduce_degrees_for_single_carry_propagation =
reduce_degrees_for_single_carry_propagation;
this->current_blocks = current_blocks;
this->small_lwe_vector = small_lwe_vector;
this->luts_message_carry = reused_lut;
uint64_t message_modulus_bits = (uint64_t)std::log2(params.message_modulus);
uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
uint64_t total_bits_per_block = message_modulus_bits + carry_modulus_bits;
uint64_t denominator =
(uint64_t)std::ceil((pow(2, total_bits_per_block) - 1) /
(pow(2, message_modulus_bits) - 1));
uint64_t upper_bound_num_blocks = max_total_blocks_in_vec * 2 / denominator;
this->luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
streams, upper_bound_num_blocks, size_tracker, allocate_gpu_memory);
setup_index_buffers(streams, size_tracker);
}
void release(CudaStreams streams) {
cuda_drop_with_size_tracking_async(d_degrees, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns_data, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns_counter, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_columns, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns_data, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns_counter, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
cuda_drop_with_size_tracking_async(d_new_columns, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
if (!mem_reuse) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
current_blocks, gpu_memory_allocated);
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
small_lwe_vector, gpu_memory_allocated);
if (allocated_luts_message_carry) {
luts_message_carry->release(streams);
delete luts_message_carry;
}
delete current_blocks;
delete small_lwe_vector;
}
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
};
// For the sequential algorithm in group propagation
template <typename Torus> struct int_seq_group_prop_memory {
CudaRadixCiphertextFFI *group_resolved_carries;
int_radix_lut<Torus> *lut_sequential_algorithm;
uint32_t grouping_size;
bool gpu_memory_allocated;
int_seq_group_prop_memory(CudaStreams streams, int_radix_params params,
uint32_t group_size, uint32_t big_lwe_size_bytes,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
grouping_size = group_size;
group_resolved_carries = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), group_resolved_carries,
grouping_size, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
int num_seq_luts = grouping_size - 1;
Torus *h_seq_lut_indexes = (Torus *)malloc(num_seq_luts * sizeof(Torus));
lut_sequential_algorithm =
new int_radix_lut<Torus>(streams, params, num_seq_luts, num_seq_luts,
allocate_gpu_memory, size_tracker);
for (int index = 0; index < num_seq_luts; index++) {
auto f_lut_sequential = [index](Torus propa_cum_sum_block) {
return (propa_cum_sum_block >> (index + 1)) & 1;
};
auto seq_lut = lut_sequential_algorithm->get_lut(0, index);
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), seq_lut,
lut_sequential_algorithm->get_degree(index),
lut_sequential_algorithm->get_max_degree(index), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_lut_sequential,
gpu_memory_allocated);
h_seq_lut_indexes[index] = index;
}
Torus *seq_lut_indexes = lut_sequential_algorithm->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_to_gpu(
seq_lut_indexes, h_seq_lut_indexes, num_seq_luts * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
auto active_streams =
streams.active_gpu_subset(num_seq_luts, params.pbs_type);
lut_sequential_algorithm->broadcast_lut(active_streams);
free(h_seq_lut_indexes);
};
void release(CudaStreams streams) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
group_resolved_carries,
gpu_memory_allocated);
lut_sequential_algorithm->release(streams);
delete group_resolved_carries;
delete lut_sequential_algorithm;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
};
};
// For the Hillis-Steele algorithm in group propagation
template <typename Torus> struct int_hs_group_prop_memory {
int_radix_lut<Torus> *lut_hillis_steele;
bool gpu_memory_allocated;
int_hs_group_prop_memory(CudaStreams streams, int_radix_params params,
uint32_t num_groups, uint32_t big_lwe_size_bytes,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
auto f_lut_hillis_steele = [](Torus msb, Torus lsb) -> Torus {
if (msb == 2) {
return 1; // Remap Generate to 1
} else if (msb == 3) {
// MSB propagates
if (lsb == 2) {
return 1;
} else {
return lsb;
} // also remap here
} else {
return msb;
}
};
lut_hillis_steele = new int_radix_lut<Torus>(
streams, params, 1, num_groups, allocate_gpu_memory, size_tracker);
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_hillis_steele->get_lut(0, 0), lut_hillis_steele->get_degree(0),
lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_hillis_steele,
gpu_memory_allocated);
auto active_streams =
streams.active_gpu_subset(num_groups, params.pbs_type);
lut_hillis_steele->broadcast_lut(active_streams);
};
void release(CudaStreams streams) {
lut_hillis_steele->release(streams);
delete lut_hillis_steele;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
}
};
// compute_shifted_blocks_and_block_states
template <typename Torus> struct int_shifted_blocks_and_states_memory {
CudaRadixCiphertextFFI *shifted_blocks_and_states;
CudaRadixCiphertextFFI *shifted_blocks;
CudaRadixCiphertextFFI *block_states;
int_radix_lut<Torus> *luts_array_first_step;
bool gpu_memory_allocated;
int_shifted_blocks_and_states_memory(
CudaStreams streams, int_radix_params params, uint32_t num_radix_blocks,
uint32_t num_many_lut, uint32_t grouping_size, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
shifted_blocks_and_states = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), shifted_blocks_and_states,
num_many_lut * num_radix_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
shifted_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), shifted_blocks,
num_radix_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
block_states = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), block_states, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
uint32_t num_luts_first_step = 2 * grouping_size + 1;
luts_array_first_step = new int_radix_lut<Torus>(
streams, params, num_luts_first_step, num_radix_blocks, num_many_lut,
allocate_gpu_memory, size_tracker);
auto f_shift_block = [message_modulus](Torus block) -> Torus {
return (block % message_modulus) << 1;
};
auto f_first_block_state = [message_modulus](Torus block) -> Torus {
if (block >= message_modulus)
return OUTPUT_CARRY::GENERATED;
else {
return OUTPUT_CARRY::NONE;
}
};
std::vector<std::function<Torus(Torus)>> f_first_grouping_luts = {
f_first_block_state, f_shift_block};
auto first_block_lut = luts_array_first_step->get_lut(0, 0);
auto first_block_lut_degrees = luts_array_first_step->get_degree(0);
auto first_block_lut_max_degree = luts_array_first_step->get_max_degree(0);
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), first_block_lut,
first_block_lut_degrees, first_block_lut_max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts,
gpu_memory_allocated);
// luts for other blocks of the first grouping
for (int lut_id = 1; lut_id < grouping_size; lut_id++) {
auto f_state = [message_modulus, lut_id](Torus block) -> Torus {
uint64_t r = 0;
if (block >= message_modulus) {
r = 2; // Generates Carry
} else if (block == (message_modulus - 1)) {
r = 1; // Propagates a carry
} else {
r = 0; // Does not generate carry
}
return r << (lut_id - 1);
};
std::vector<std::function<Torus(Torus)>> f_grouping_luts = {
f_state, f_shift_block};
auto lut = luts_array_first_step->get_lut(0, lut_id);
auto lut_degrees = luts_array_first_step->get_degree(lut_id);
auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id);
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, f_grouping_luts, gpu_memory_allocated);
}
// luts for the rest of groupings (except for the last block)
for (int i = 0; i < grouping_size; i++) {
uint32_t lut_id = i + grouping_size;
auto f_state = [message_modulus, i](Torus block) -> Torus {
uint64_t r = 0;
if (block >= message_modulus) {
r = 2; // Generates Carry
} else if (block == (message_modulus - 1)) {
r = 1; // Propagates a carry
} else {
r = 0; // Does not generate a carry
}
return r << i;
};
std::vector<std::function<Torus(Torus)>> f_grouping_luts = {
f_state, f_shift_block};
auto lut = luts_array_first_step->get_lut(0, lut_id);
auto lut_degrees = luts_array_first_step->get_degree(lut_id);
auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id);
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, f_grouping_luts, gpu_memory_allocated);
}
// For the last block we need to generate a new lut
auto f_last_block_state = [message_modulus](Torus block) -> Torus {
if (block >= message_modulus)
return 2 << 1; // Generates a carry
else
return 0; // Nothing
};
uint32_t lut_id = num_luts_first_step - 1; // The last lut of the first step
auto last_block_lut = luts_array_first_step->get_lut(0, lut_id);
auto last_block_lut_degrees = luts_array_first_step->get_degree(lut_id);
auto last_block_lut_max_degree =
luts_array_first_step->get_max_degree(lut_id);
std::vector<std::function<Torus(Torus)>> f_last_grouping_luts = {
f_last_block_state, f_shift_block};
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), last_block_lut,
last_block_lut_degrees, last_block_lut_max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts,
gpu_memory_allocated);
// Generate the indexes to switch between luts within the pbs
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
Torus *h_lut_indexes = luts_array_first_step->h_lut_indexes;
for (int index = 0; index < num_radix_blocks; index++) {
uint32_t grouping_index = index / grouping_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % grouping_size;
bool is_last_index = (index == (num_radix_blocks - 1));
if (is_last_index) {
if (num_radix_blocks == 1) {
h_lut_indexes[index] = 2 * grouping_size;
} else {
h_lut_indexes[index] = 2;
}
} else if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else {
h_lut_indexes[index] = index_in_grouping + grouping_size;
}
}
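// For example, with grouping_size = 4 and num_radix_blocks = 8 this loop
// produces {0, 1, 2, 3, 4, 5, 6, 2}.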
// copy the indexes to the gpu
Torus *lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
// TODO: check whether anything more is needed for the multi-GPU case
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_array_first_step->broadcast_lut(active_streams);
};
void release(CudaStreams streams) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
shifted_blocks_and_states,
gpu_memory_allocated);
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
shifted_blocks, gpu_memory_allocated);
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
block_states, gpu_memory_allocated);
luts_array_first_step->release(streams);
delete luts_array_first_step;
delete shifted_blocks_and_states;
delete shifted_blocks;
delete block_states;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
};
};
// Memory for the propagation simulation and group-carry resolution (step 2
// of carry/borrow propagation)
template <typename Torus> struct int_prop_simu_group_carries_memory {
CudaRadixCiphertextFFI *propagation_cum_sums;
CudaRadixCiphertextFFI *simulators;
CudaRadixCiphertextFFI *prepared_blocks;
CudaRadixCiphertextFFI *grouping_pgns;
CudaRadixCiphertextFFI *resolved_carries;
Torus *scalar_array_cum_sum;
Torus *h_scalar_array_cum_sum;
int_radix_lut<Torus> *luts_array_second_step;
int_seq_group_prop_memory<Torus> *seq_group_prop_mem;
int_hs_group_prop_memory<Torus> *hs_group_prop_mem;
uint32_t group_size;
bool use_sequential_algorithm_to_resolve_group_carries;
bool gpu_memory_allocated;
int_prop_simu_group_carries_memory(
CudaStreams streams, int_radix_params params, uint32_t num_radix_blocks,
uint32_t grouping_size, uint32_t num_groups, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
auto big_lwe_size = (polynomial_size * glwe_dimension + 1);
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
uint32_t block_modulus = message_modulus * carry_modulus;
uint32_t num_bits_in_block = std::log2(block_modulus);
group_size = grouping_size;
propagation_cum_sums = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), propagation_cum_sums,
num_radix_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
simulators = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), simulators, num_radix_blocks,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
prepared_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), prepared_blocks,
num_radix_blocks + 1, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
resolved_carries = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), resolved_carries,
num_groups + 1, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
grouping_pgns = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), grouping_pgns, num_groups,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
scalar_array_cum_sum = (Torus *)cuda_malloc_with_size_tracking_async(
num_radix_blocks * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), size_tracker, allocate_gpu_memory);
cuda_memset_with_size_tracking_async(
scalar_array_cum_sum, 0, num_radix_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
h_scalar_array_cum_sum = new Torus[num_radix_blocks]();
// create lut objects for step 2
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
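// Decide how carries between groups will be resolved: either sequentially
// ((grouping_size - 1) group carries resolved per layer) or with a
// Hillis-Steele prefix scan (logarithmic depth). The shallower of the two is
// chosen, preferring the sequential path on ties.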
uint32_t num_carry_to_resolve = num_groups - 1;
uint32_t saturated_sub =
((num_carry_to_resolve > 1) ? num_carry_to_resolve - 1 : 0);
uint32_t sequential_depth = saturated_sub / (grouping_size - 1);
uint32_t hillis_steele_depth;
if (num_carry_to_resolve == 0) {
hillis_steele_depth = 0;
} else {
hillis_steele_depth = std::ceil(std::log2(num_carry_to_resolve));
}
use_sequential_algorithm_to_resolve_group_carries =
sequential_depth <= hillis_steele_depth;
uint32_t num_extra_luts = 0;
if (use_sequential_algorithm_to_resolve_group_carries) {
num_extra_luts = (grouping_size - 1);
} else {
num_extra_luts = 1;
}
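// Second-step lut layout: luts [0, grouping_size - 1) resolve the carries
// inside the first grouping, lut (grouping_size - 1) extracts the carry
// leaving the first grouping, luts [grouping_size, 2 * grouping_size) resolve
// the carries inside the other groupings, and the last num_extra_luts luts
// resolve the carries between groups.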
uint32_t num_luts_second_step = 2 * grouping_size + num_extra_luts;
luts_array_second_step = new int_radix_lut<Torus>(
streams, params, num_luts_second_step, num_radix_blocks,
allocate_gpu_memory, size_tracker);
// luts for first group inner propagation
for (int lut_id = 0; lut_id < grouping_size - 1; lut_id++) {
auto f_first_grouping_inner_propagation =
[lut_id](Torus propa_cum_sum_block) -> Torus {
uint64_t carry = (propa_cum_sum_block >> lut_id) & 1;
if (carry != 0) {
return 2ull; // Generates Carry
} else {
return 0ull; // Does not generate carry
}
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array_second_step->get_lut(0, lut_id),
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_first_grouping_inner_propagation, gpu_memory_allocated);
}
auto f_first_grouping_outer_propagation =
[num_bits_in_block](Torus block) -> Torus {
return (block >> (num_bits_in_block - 1)) & 1;
};
int lut_id = grouping_size - 1;
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array_second_step->get_lut(0, lut_id),
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_first_grouping_outer_propagation, gpu_memory_allocated);
// for other groupings inner propagation
for (int index = 0; index < grouping_size; index++) {
uint32_t lut_id = index + grouping_size;
auto f_other_groupings_inner_propagation =
[index](Torus propa_cum_sum_block) -> Torus {
uint64_t mask = (2 << index) - 1;
if (propa_cum_sum_block >= (2 << index)) {
return 2ull; // Generates
} else if ((propa_cum_sum_block & mask) == mask) {
return 1ull; // Propagate
} else {
return 0ull; // Nothing
}
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array_second_step->get_lut(0, lut_id),
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_other_groupings_inner_propagation, gpu_memory_allocated);
}
if (use_sequential_algorithm_to_resolve_group_carries) {
for (int index = 0; index < grouping_size - 1; index++) {
uint32_t lut_id = index + 2 * grouping_size;
auto f_group_propagation = [index, block_modulus,
num_bits_in_block](Torus block) -> Torus {
if (block == (block_modulus - 1)) {
return 0ull;
} else {
return ((UINT64_MAX << index) % (1ull << (num_bits_in_block + 1)));
}
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array_second_step->get_lut(0, lut_id),
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus,
f_group_propagation, gpu_memory_allocated);
}
} else {
uint32_t lut_id = 2 * grouping_size;
auto f_group_propagation = [block_modulus](Torus block) {
if (block == (block_modulus - 1)) {
return 2ull;
} else {
return UINT64_MAX % (block_modulus * 2ull);
}
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
luts_array_second_step->get_lut(0, lut_id),
luts_array_second_step->get_degree(lut_id),
luts_array_second_step->get_max_degree(lut_id), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_group_propagation,
gpu_memory_allocated);
}
Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size);
for (int index = 0; index < num_radix_blocks; index++) {
uint32_t grouping_index = index / grouping_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % grouping_size;
if (is_in_first_grouping) {
h_second_lut_indexes[index] = index_in_grouping;
} else if (index_in_grouping == (grouping_size - 1)) {
if (use_sequential_algorithm_to_resolve_group_carries) {
int inner_index = (grouping_index - 1) % (grouping_size - 1);
h_second_lut_indexes[index] = inner_index + 2 * grouping_size;
} else {
h_second_lut_indexes[index] = 2 * grouping_size;
}
} else {
h_second_lut_indexes[index] = index_in_grouping + grouping_size;
}
bool may_have_its_padding_bit_set =
!is_in_first_grouping && (index_in_grouping == grouping_size - 1);
if (may_have_its_padding_bit_set) {
if (use_sequential_algorithm_to_resolve_group_carries) {
h_scalar_array_cum_sum[index] =
1 << ((grouping_index - 1) % (grouping_size - 1));
} else {
h_scalar_array_cum_sum[index] = 1;
}
} else {
h_scalar_array_cum_sum[index] = 0;
}
}
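// For example, with grouping_size = 4 and num_radix_blocks = 8 (so
// num_groups = 2 and sequential group-carry resolution) this loop produces
// lut indexes {0, 1, 2, 3, 4, 5, 6, 8} and scalars {0, 0, 0, 0, 0, 0, 0, 1}.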
// copy the indexes to the gpu
Torus *second_lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_to_gpu(
second_lut_indexes, h_second_lut_indexes, lut_indexes_size,
streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
cuda_memcpy_with_size_tracking_async_to_gpu(
scalar_array_cum_sum, h_scalar_array_cum_sum,
num_radix_blocks * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_array_second_step->broadcast_lut(active_streams);
if (use_sequential_algorithm_to_resolve_group_carries) {
seq_group_prop_mem = new int_seq_group_prop_memory<Torus>(
streams, params, grouping_size, big_lwe_size_bytes,
allocate_gpu_memory, size_tracker);
} else {
hs_group_prop_mem = new int_hs_group_prop_memory<Torus>(
streams, params, num_groups, big_lwe_size_bytes, allocate_gpu_memory,
size_tracker);
}
free(h_second_lut_indexes);
};
// needed for the division to update the lut indexes
void update_lut_indexes(CudaStreams streams, Torus *new_lut_indexes,
Torus *new_scalars, uint32_t new_num_blocks) {
Torus *lut_indexes = luts_array_second_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
auto new_active_streams = streams.active_gpu_subset(
new_num_blocks, luts_array_second_step->params.pbs_type);
// We just need to update the lut indexes so we use false here
luts_array_second_step->broadcast_lut(new_active_streams, false);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
scalar_array_cum_sum, new_scalars, new_num_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
}
void release(CudaStreams streams) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
propagation_cum_sums, gpu_memory_allocated);
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
simulators, gpu_memory_allocated);
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
grouping_pgns, gpu_memory_allocated);
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
prepared_blocks, gpu_memory_allocated);
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
resolved_carries, gpu_memory_allocated);
cuda_drop_with_size_tracking_async(scalar_array_cum_sum, streams.stream(0),
streams.gpu_index(0),
gpu_memory_allocated);
luts_array_second_step->release(streams);
if (use_sequential_algorithm_to_resolve_group_carries) {
seq_group_prop_mem->release(streams);
delete seq_group_prop_mem;
} else {
hs_group_prop_mem->release(streams);
delete hs_group_prop_mem;
}
delete propagation_cum_sums;
delete simulators;
delete grouping_pgns;
delete prepared_blocks;
delete resolved_carries;
delete luts_array_second_step;
delete[] h_scalar_array_cum_sum;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
};
};
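// Memory for carry propagation over a radix ciphertext, done in three steps:
// (1) compute the shifted blocks and block states, (2) simulate the
// propagation and resolve the carries between groups, (3) extract the message
// blocks and, if requested, the overflow or carry flag.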
template <typename Torus> struct int_sc_prop_memory {
uint32_t num_many_lut;
uint32_t lut_stride;
uint32_t num_groups;
CudaRadixCiphertextFFI *output_flag;
CudaRadixCiphertextFFI *last_lhs;
CudaRadixCiphertextFFI *last_rhs;
int_radix_lut<Torus> *lut_message_extract;
int_radix_lut<Torus> *lut_overflow_flag_prep;
int_shifted_blocks_and_states_memory<Torus> *shifted_blocks_state_mem;
int_prop_simu_group_carries_memory<Torus> *prop_simu_group_carries_mem;
int_radix_params params;
uint32_t requested_flag;
bool gpu_memory_allocated;
int_sc_prop_memory(CudaStreams streams, int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
requested_flag = requested_flag_in;
// for compute shifted blocks and block states
uint32_t block_modulus = message_modulus * carry_modulus;
uint32_t num_bits_in_block = std::log2(block_modulus);
uint32_t grouping_size = num_bits_in_block;
num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;
num_many_lut = 2; // a single many-lut PBS applies 2 luts at once
uint32_t box_size = polynomial_size / block_modulus;
lut_stride = (block_modulus / num_many_lut) * box_size;
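// With num_many_lut functions packed into a single test polynomial,
// consecutive functions are spaced (block_modulus / num_many_lut) boxes of
// box_size coefficients apart; lut_stride is presumably the offset used when
// extracting the extra lut results.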
shifted_blocks_state_mem = new int_shifted_blocks_and_states_memory<Torus>(
streams, params, num_radix_blocks, num_many_lut, grouping_size,
allocate_gpu_memory, size_tracker);
prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory<Torus>(
streams, params, num_radix_blocks, grouping_size, num_groups,
allocate_gpu_memory, size_tracker);
// Step 3 elements
int num_luts_message_extract =
requested_flag == outputFlag::FLAG_NONE ? 1 : 2;
lut_message_extract = new int_radix_lut<Torus>(
streams, params, num_luts_message_extract, num_radix_blocks + 1,
allocate_gpu_memory, size_tracker);
// lut to extract the message from the propagated blocks
auto f_message_extract = [message_modulus](Torus block) -> Torus {
return (block >> 1) % message_modulus;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_message_extract,
gpu_memory_allocated);
// This stores a single block that will be used to hold the overflow or
// carry result
output_flag = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), output_flag,
num_radix_blocks + 1, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
if (requested_flag == outputFlag::FLAG_OVERFLOW) {
last_lhs = new CudaRadixCiphertextFFI;
last_rhs = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), last_lhs, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), last_rhs, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
// For step 1, overflow detection is enabled only if the overflow flag is
// requested
uint32_t num_bits_in_message = std::log2(message_modulus);
lut_overflow_flag_prep = new int_radix_lut<Torus>(
streams, params, 1, 1, allocate_gpu_memory, size_tracker);
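// For the last (lhs, rhs) pair of blocks, precompute whether a signed
// overflow would occur for an input carry of 1 and for an input carry of 0:
// overflow happens when the carry into the top message bit differs from the
// carry out of it. The two candidate bits are packed into bits 3 and 2 so
// the final overflow lut can select the right one once the real carry is
// known.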
auto f_overflow_fp = [num_bits_in_message](Torus lhs,
Torus rhs) -> Torus {
Torus mask = (1 << (num_bits_in_message - 1)) - 1;
Torus lhs_except_last_bit = lhs & mask;
Torus rhs_except_last_bit = rhs & mask;
Torus input_carry1 = 1;
Torus input_carry2 = 0;
Torus output_carry1 =
((lhs + rhs + input_carry1) >> num_bits_in_message) & 1;
Torus output_carry2 =
((lhs + rhs + input_carry2) >> num_bits_in_message) & 1;
Torus input_carry_last_bit1 =
((lhs_except_last_bit + rhs_except_last_bit + input_carry1) >>
(num_bits_in_message - 1)) &
1;
Torus input_carry_last_bit2 =
((lhs_except_last_bit + rhs_except_last_bit + input_carry2) >>
(num_bits_in_message - 1)) &
1;
Torus output1 = (Torus)(input_carry_last_bit1 != output_carry1);
Torus output2 = (Torus)(input_carry_last_bit2 != output_carry2);
return output1 << 3 | output2 << 2;
};
generate_device_accumulator_bivariate<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_overflow_flag_prep->get_lut(0, 0),
lut_overflow_flag_prep->get_degree(0),
lut_overflow_flag_prep->get_max_degree(0), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
gpu_memory_allocated);
auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
lut_overflow_flag_prep->broadcast_lut(active_streams);
}
// Final cleanup lut in case of overflow or carry. It could probably be
// applied together with the previous one, but for now we keep them separate.
if (requested_flag == outputFlag::FLAG_OVERFLOW) { // Overflow case
auto f_overflow_last = [num_radix_blocks,
requested_flag_in](Torus block) -> Torus {
uint32_t position = (num_radix_blocks == 1 &&
requested_flag_in == outputFlag::FLAG_OVERFLOW)
? 0
: 1;
Torus input_carry = (block >> position) & 1;
Torus does_overflow_if_carry_is_1 = (block >> 3) & 1;
Torus does_overflow_if_carry_is_0 = (block >> 2) & 1;
if (input_carry == outputFlag::FLAG_OVERFLOW) {
return does_overflow_if_carry_is_1;
} else {
return does_overflow_if_carry_is_0;
}
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_message_extract->get_lut(0, 1),
lut_message_extract->get_degree(1),
lut_message_extract->get_max_degree(1), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_overflow_last,
gpu_memory_allocated);
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
for (int index = 0; index < num_radix_blocks + 1; index++) {
if (index < num_radix_blocks) {
h_lut_indexes[index] = 0;
} else {
h_lut_indexes[index] = 1;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
(num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
}
if (requested_flag == outputFlag::FLAG_CARRY) { // Carry case
auto f_carry_last = [](Torus block) -> Torus {
return ((block >> 2) & 1);
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_message_extract->get_lut(0, 1),
lut_message_extract->get_degree(1),
lut_message_extract->get_max_degree(1), glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_carry_last,
gpu_memory_allocated);
Torus *h_lut_indexes = lut_message_extract->h_lut_indexes;
for (int index = 0; index < num_radix_blocks + 1; index++) {
if (index < num_radix_blocks) {
h_lut_indexes[index] = 0;
} else {
h_lut_indexes[index] = 1;
}
}
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_message_extract->get_lut_indexes(0, 0), h_lut_indexes,
(num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
}
auto active_streams =
streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
lut_message_extract->broadcast_lut(active_streams);
};
void release(CudaStreams streams) {
shifted_blocks_state_mem->release(streams);
prop_simu_group_carries_mem->release(streams);
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
output_flag, gpu_memory_allocated);
lut_message_extract->release(streams);
delete shifted_blocks_state_mem;
delete prop_simu_group_carries_mem;
delete output_flag;
delete lut_message_extract;
if (requested_flag == outputFlag::FLAG_OVERFLOW) { // In case of overflow
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
last_lhs, gpu_memory_allocated);
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
last_rhs, gpu_memory_allocated);
lut_overflow_flag_prep->release(streams);
delete lut_overflow_flag_prep;
delete last_lhs;
delete last_rhs;
}
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
};
};
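// Same as int_shifted_blocks_and_states_memory, but for borrow propagation:
// the block states encode borrow generation/propagation instead of carry
// generation/propagation.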
template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
CudaRadixCiphertextFFI *shifted_blocks_and_borrow_states;
CudaRadixCiphertextFFI *shifted_blocks;
CudaRadixCiphertextFFI *borrow_states;
int_radix_lut<Torus> *luts_array_first_step;
bool gpu_memory_allocated;
int_shifted_blocks_and_borrow_states_memory(
CudaStreams streams, int_radix_params params, uint32_t num_radix_blocks,
uint32_t num_many_lut, uint32_t grouping_size, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
shifted_blocks_and_borrow_states = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0),
shifted_blocks_and_borrow_states, num_radix_blocks * num_many_lut,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
shifted_blocks = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), shifted_blocks,
num_radix_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
borrow_states = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), borrow_states,
num_radix_blocks, params.big_lwe_dimension, size_tracker,
allocate_gpu_memory);
uint32_t num_luts_first_step = 2 * grouping_size + 1;
luts_array_first_step = new int_radix_lut<Torus>(
streams, params, num_luts_first_step, num_radix_blocks, num_many_lut,
allocate_gpu_memory, size_tracker);
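// Unlike the carry version, the shifted block keeps an overflow guard
// (message_modulus is OR-ed in before the shift), presumably so that
// subtracting the resolved borrows later cannot make the block go negative.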
auto f_shift_block = [message_modulus](Torus block) -> Torus {
uint64_t overflow_guard = message_modulus;
uint64_t block_mod = block % message_modulus;
return (overflow_guard | block_mod) << 1;
};
auto f_first_block_state = [message_modulus](Torus block) -> Torus {
if (block < message_modulus)
return 1; // Borrows
else {
return 0; // Nothing
}
};
std::vector<std::function<Torus(Torus)>> f_first_grouping_luts = {
f_first_block_state, f_shift_block};
auto first_block_lut = luts_array_first_step->get_lut(0, 0);
auto first_block_lut_degrees = luts_array_first_step->get_degree(0);
auto first_block_lut_max_degree = luts_array_first_step->get_max_degree(0);
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), first_block_lut,
first_block_lut_degrees, first_block_lut_max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts,
gpu_memory_allocated);
// luts for other blocks of the first grouping
for (int lut_id = 1; lut_id < grouping_size; lut_id++) {
auto f_state = [message_modulus, lut_id](Torus block) -> Torus {
uint64_t r = 0;
if (block < message_modulus) {
r = 2; // Borrows
} else if (block == message_modulus) {
r = 1; // Propagates a borrow
} else {
r = 0; // Does not borrow
}
return r << (lut_id - 1);
};
std::vector<std::function<Torus(Torus)>> f_grouping_luts = {
f_state, f_shift_block};
auto lut = luts_array_first_step->get_lut(0, lut_id);
auto lut_degrees = luts_array_first_step->get_degree(lut_id);
auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id);
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, f_grouping_luts, gpu_memory_allocated);
}
// luts for the rest of the groupings (except for the last block)
for (int i = 0; i < grouping_size; i++) {
uint32_t lut_id = i + grouping_size;
auto f_state = [message_modulus, i](Torus block) -> Torus {
uint64_t r = 0;
if (block < message_modulus) {
r = 2; // Generates borrow
} else if (block == message_modulus) {
r = 1; // Propagates a borrow
} else {
r = 0; // Does not borrow
}
return r << i;
};
std::vector<std::function<Torus(Torus)>> f_grouping_luts = {
f_state, f_shift_block};
auto lut = luts_array_first_step->get_lut(0, lut_id);
auto lut_degrees = luts_array_first_step->get_degree(lut_id);
auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id);
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), lut, lut_degrees,
lut_max_degree, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, f_grouping_luts, gpu_memory_allocated);
}
auto f_last_block_state = [message_modulus](Torus block) -> Torus {
if (block < message_modulus)
return 2 << 1; // Generates a borrow
else
return 0; // Nothing
};
uint32_t lut_id = num_luts_first_step - 1; // The last lut of the first step
auto last_block_lut = luts_array_first_step->get_lut(0, lut_id);
auto last_block_lut_degrees = luts_array_first_step->get_degree(lut_id);
auto last_block_lut_max_degree =
luts_array_first_step->get_max_degree(lut_id);
std::vector<std::function<Torus(Torus)>> f_last_grouping_luts = {
f_last_block_state, f_shift_block};
generate_many_lut_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0), last_block_lut,
last_block_lut_degrees, last_block_lut_max_degree, glwe_dimension,
polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts,
gpu_memory_allocated);
// Generate the indexes to switch between luts within the pbs
uint64_t lut_indexes_size = num_radix_blocks * sizeof(Torus);
Torus *h_lut_indexes = luts_array_first_step->h_lut_indexes;
for (int index = 0; index < num_radix_blocks; index++) {
uint32_t grouping_index = index / grouping_size;
bool is_in_first_grouping = (grouping_index == 0);
uint32_t index_in_grouping = index % grouping_size;
bool is_last_index = (index == (num_radix_blocks - 1));
if (is_last_index) {
if (num_radix_blocks == 1) {
h_lut_indexes[index] = 2 * grouping_size;
} else {
h_lut_indexes[index] = 2;
}
} else if (is_in_first_grouping) {
h_lut_indexes[index] = index_in_grouping;
} else {
h_lut_indexes[index] = index_in_grouping + grouping_size;
}
}
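// Same index layout as in int_shifted_blocks_and_states_memory above, e.g.
// {0, 1, 2, 3, 4, 5, 6, 2} for grouping_size = 4 and num_radix_blocks = 8.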
// copy the indexes to the gpu
Torus *lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_to_gpu(
lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
streams.gpu_index(0), allocate_gpu_memory);
// TODO: check whether anything more is needed for the multi-GPU case
auto active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
luts_array_first_step->broadcast_lut(active_streams);
};
// needed for the division to update the lut indexes
void update_lut_indexes(CudaStreams streams, Torus *new_lut_indexes,
uint32_t new_num_blocks) {
Torus *lut_indexes = luts_array_first_step->get_lut_indexes(0, 0);
cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
auto new_active_streams = streams.active_gpu_subset(
new_num_blocks, luts_array_first_step->params.pbs_type);
// We just need to update the lut indexes so we use false here
luts_array_first_step->broadcast_lut(new_active_streams, false);
}
void release(CudaStreams streams) {
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
shifted_blocks_and_borrow_states,
gpu_memory_allocated);
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
shifted_blocks, gpu_memory_allocated);
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
borrow_states, gpu_memory_allocated);
luts_array_first_step->release(streams);
delete luts_array_first_step;
delete shifted_blocks_and_borrow_states;
delete shifted_blocks;
delete borrow_states;
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
};
};
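// Memory for borrow propagation: step 1 computes the shifted blocks and
// borrow states, step 2 simulates the propagation and resolves the borrows
// between groups, step 3 extracts the message blocks and, if requested, the
// overflow block.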
template <typename Torus> struct int_borrow_prop_memory {
uint32_t num_many_lut;
uint32_t lut_stride;
uint32_t group_size;
uint32_t num_groups;
CudaRadixCiphertextFFI *overflow_block;
int_radix_lut<Torus> *lut_message_extract;
int_radix_lut<Torus> *lut_borrow_flag;
int_shifted_blocks_and_borrow_states_memory<Torus>
*shifted_blocks_borrow_state_mem;
int_prop_simu_group_carries_memory<Torus> *prop_simu_group_carries_mem;
int_radix_params params;
CudaStreams active_streams;
InternalCudaStreams internal_streams;
uint32_t compute_overflow;
bool gpu_memory_allocated;
int_borrow_prop_memory(CudaStreams streams, int_radix_params params,
uint32_t num_radix_blocks,
uint32_t compute_overflow_in, bool allocate_gpu_memory,
uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
auto carry_modulus = params.carry_modulus;
compute_overflow = compute_overflow_in;
// for compute shifted blocks and block states
uint32_t block_modulus = message_modulus * carry_modulus;
uint32_t num_bits_in_block = std::log2(block_modulus);
uint32_t grouping_size = num_bits_in_block;
group_size = grouping_size;
num_groups = (num_radix_blocks + grouping_size - 1) / grouping_size;
num_many_lut = 2; // a single many-lut PBS applies 2 luts at once
uint32_t box_size = polynomial_size / block_modulus;
lut_stride = (block_modulus / num_many_lut) * box_size;
shifted_blocks_borrow_state_mem =
new int_shifted_blocks_and_borrow_states_memory<Torus>(
streams, params, num_radix_blocks, num_many_lut, grouping_size,
allocate_gpu_memory, size_tracker);
prop_simu_group_carries_mem = new int_prop_simu_group_carries_memory<Torus>(
streams, params, num_radix_blocks, grouping_size, num_groups,
allocate_gpu_memory, size_tracker);
overflow_block = new CudaRadixCiphertextFFI;
create_zero_radix_ciphertext_async<Torus>(
streams.stream(0), streams.gpu_index(0), overflow_block, 1,
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
lut_message_extract =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
// lut to extract the message from the propagated blocks
auto f_message_extract = [message_modulus](Torus block) -> Torus {
return (block >> 1) % message_modulus;
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_message_extract->get_lut(0, 0), lut_message_extract->get_degree(0),
lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_message_extract,
gpu_memory_allocated);
active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
lut_message_extract->broadcast_lut(active_streams);
if (compute_overflow) {
lut_borrow_flag =
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
allocate_gpu_memory, size_tracker);
// lut to extract the borrow/overflow flag
auto f_borrow_flag = [](Torus block) -> Torus {
return ((block >> 2) & 1);
};
generate_device_accumulator<Torus>(
streams.stream(0), streams.gpu_index(0),
lut_borrow_flag->get_lut(0, 0), lut_borrow_flag->get_degree(0),
lut_borrow_flag->get_max_degree(0), glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_borrow_flag, gpu_memory_allocated);
lut_borrow_flag->broadcast_lut(active_streams);
}
active_streams =
streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
internal_streams.create_internal_cuda_streams_on_same_gpus(active_streams,
2);
};
// needed for the division to update the lut indexes
void update_lut_indexes(CudaStreams streams, Torus *first_indexes_for_div,
Torus *second_indexes_for_div, Torus *scalars_for_div,
uint32_t new_num_blocks) {
shifted_blocks_borrow_state_mem->update_lut_indexes(
streams, first_indexes_for_div, new_num_blocks);
prop_simu_group_carries_mem->update_lut_indexes(
streams, second_indexes_for_div, scalars_for_div, new_num_blocks);
}
void release(CudaStreams streams) {
shifted_blocks_borrow_state_mem->release(streams);
prop_simu_group_carries_mem->release(streams);
release_radix_ciphertext_async(streams.stream(0), streams.gpu_index(0),
overflow_block, gpu_memory_allocated);
lut_message_extract->release(streams);
delete lut_message_extract;
delete overflow_block;
if (compute_overflow) {
lut_borrow_flag->release(streams);
delete lut_borrow_flag;
}
internal_streams.release(streams);
cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
};
};
std::pair<bool, bool> get_invert_flags(COMPARISON_TYPE compare);
void reverseArray(uint64_t arr[], size_t n);
#endif // CUDA_INTEGER_UTILITIES_H