diff --git a/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml b/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml index 633e4eb82..fdcb89082 100644 --- a/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml +++ b/.github/workflows/integer_multi_bit_multi_gpu_benchmark.yml @@ -46,8 +46,8 @@ jobs: github-token: ${{ secrets.SLAB_ACTION_TOKEN }} slab-url: ${{ secrets.SLAB_BASE_URL }} job-secret: ${{ secrets.JOB_SECRET }} - backend: aws - profile: multi-gpu-test + backend: hyperstack + profile: multi-h100-nvlink cuda-integer-multi-bit-multi-gpu-benchmarks: name: Execute multi GPU integer multi-bit benchmarks @@ -62,11 +62,23 @@ jobs: include: - os: ubuntu-22.04 cuda: "12.2" - gcc: 9 + gcc: 11 env: CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} - + CMAKE_VERSION: 3.29.6 steps: + # Mandatory on hyperstack since a bootable volume is not re-usable yet. + - name: Install dependencies + run: | + sudo apt update + sudo apt install -y checkinstall zlib1g-dev libssl-dev + wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz + tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz + cd cmake-${{ env.CMAKE_VERSION }} + ./bootstrap + make -j"$(nproc)" + sudo make install + - name: Checkout tfhe-rs repo with tags uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 with: @@ -135,7 +147,7 @@ jobs: run: | python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \ --database tfhe_rs \ - --hardware "p3.8xlarge" \ + --hardware "n3-H100x8-nvlink" \ --backend gpu \ --project-version "${{ env.COMMIT_HASH }}" \ --branch ${{ github.ref_name }} \ diff --git a/.github/workflows/integer_multi_gpu_full_benchmark.yml b/.github/workflows/integer_multi_gpu_full_benchmark.yml index 40d56786e..13c5a0ae7 100644 --- a/.github/workflows/integer_multi_gpu_full_benchmark.yml +++ b/.github/workflows/integer_multi_gpu_full_benchmark.yml @@ -35,8 +35,8 @@ jobs: github-token: ${{ secrets.SLAB_ACTION_TOKEN }} slab-url: ${{ secrets.SLAB_BASE_URL }} job-secret: ${{ secrets.JOB_SECRET }} - backend: aws - profile: multi-gpu-test + backend: hyperstack + profile: multi-h100-nvlink cuda-integer-full-multi-gpu-benchmarks: name: Execute multi GPU integer benchmarks for all operations flavor @@ -54,11 +54,23 @@ jobs: include: - os: ubuntu-22.04 cuda: "12.2" - gcc: 9 + gcc: 11 env: CUDA_PATH: /usr/local/cuda-${{ matrix.cuda }} - + CMAKE_VERSION: 3.29.6 steps: + # Mandatory on hyperstack since a bootable volume is not re-usable yet. 
+ - name: Install dependencies + run: | + sudo apt update + sudo apt install -y checkinstall zlib1g-dev libssl-dev + wget https://github.com/Kitware/CMake/releases/download/v${{ env.CMAKE_VERSION }}/cmake-${{ env.CMAKE_VERSION }}.tar.gz + tar -zxvf cmake-${{ env.CMAKE_VERSION }}.tar.gz + cd cmake-${{ env.CMAKE_VERSION }} + ./bootstrap + make -j"$(nproc)" + sudo make install + - name: Checkout tfhe-rs repo with tags uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 with: @@ -117,7 +129,7 @@ jobs: run: | python3 ./ci/benchmark_parser.py target/criterion ${{ env.RESULTS_FILENAME }} \ --database tfhe_rs \ - --hardware "p3.8xlarge" \ + --hardware "n3-H100x8-nvlink" \ --backend gpu \ --project-version "${{ env.COMMIT_HASH }}" \ --branch ${{ github.ref_name }} \ diff --git a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h index e928263b9..cbdd17fbf 100644 --- a/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h +++ b/backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h @@ -1,6 +1,8 @@ #ifndef HELPER_MULTI_GPU_H #define HELPER_MULTI_GPU_H #include +#include +#include extern std::mutex m; extern bool p2p_enabled; @@ -9,6 +11,20 @@ extern "C" { int cuda_setup_multi_gpu(); } +// Define a variant type that can be either a vector or a single pointer +template +using LweArrayVariant = std::variant, Torus *>; + +// Macro to define the visitor logic using std::holds_alternative for vectors +#define GET_VARIANT_ELEMENT(variant, index) \ + [&] { \ + if (std::holds_alternative>(variant)) { \ + return std::get>(variant)[index]; \ + } else { \ + return std::get(variant); \ + } \ + }() + int get_active_gpu_count(int num_inputs, int gpu_count); int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count); diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index a55c3d03e..df4eb39df 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -487,11 +487,21 @@ template struct int_radix_lut { // for the moment Torus *lwe_indexes_in; Torus *lwe_indexes_out; + Torus *h_lwe_indexes_in; + Torus *h_lwe_indexes_out; + // Enable optimizations if lwe_indexes_(in/out) are trivial + bool using_trivial_lwe_indexes = true; // lwe_trivial_indexes is the intermediary index we need in case // lwe_indexes_in != lwe_indexes_out Torus *lwe_trivial_indexes; Torus *tmp_lwe_before_ks; - Torus *tmp_lwe_after_ks; + + /// For multi GPU execution we create vectors of pointers for inputs and + /// outputs + std::vector lwe_array_in_vec; + std::vector lwe_after_ks_vec; + std::vector lwe_after_pbs_vec; + std::vector lwe_trivial_indexes_vec; int_radix_lut(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_luts, @@ -511,7 +521,7 @@ template struct int_radix_lut { cudaSetDevice(i); int8_t *gpu_pbs_buffer; auto num_blocks_on_gpu = - get_num_inputs_on_gpu(num_radix_blocks, i, gpu_count); + get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count); execute_scratch_pbs( streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension, @@ -551,20 +561,43 @@ template struct int_radix_lut { num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); lwe_trivial_indexes = (Torus *)cuda_malloc_async( num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + + h_lwe_indexes_in 
= (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); for (int i = 0; i < num_radix_blocks; i++) - h_lwe_indexes[i] = i; + h_lwe_indexes_in[i] = i; - cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes, + cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes, + cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes, + cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); + memcpy(h_lwe_indexes_out, h_lwe_indexes_in, + num_radix_blocks * sizeof(Torus)); + + /// With multiple GPUs we allocate arrays to be pushed to the vectors and + /// copy data on each GPU then when we gather data to GPU 0 we can copy + /// back to the original indexing + multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count, + lwe_array_in_vec, num_radix_blocks, + params.big_lwe_dimension + 1); + multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count, + lwe_after_ks_vec, num_radix_blocks, + params.small_lwe_dimension + 1); + multi_gpu_alloc_lwe_async(streams, gpu_indexes, active_gpu_count, + lwe_after_pbs_vec, num_radix_blocks, + params.big_lwe_dimension + 1); + multi_gpu_alloc_array_async(streams, gpu_indexes, active_gpu_count, + lwe_trivial_indexes_vec, num_radix_blocks); + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + multi_gpu_copy_array_async(streams, gpu_indexes, active_gpu_count, + lwe_trivial_indexes_vec, lwe_trivial_indexes, + num_radix_blocks); // Keyswitch Torus big_size = @@ -573,10 +606,6 @@ template struct int_radix_lut { (params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); tmp_lwe_before_ks = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]); - tmp_lwe_after_ks = - (Torus *)cuda_malloc_async(small_size, streams[0], gpu_indexes[0]); - cuda_synchronize_stream(streams[0], gpu_indexes[0]); - free(h_lwe_indexes); } } @@ -598,7 +627,14 @@ template struct int_radix_lut { buffer = base_lut_object->buffer; // Keyswitch tmp_lwe_before_ks = base_lut_object->tmp_lwe_before_ks; - tmp_lwe_after_ks = base_lut_object->tmp_lwe_after_ks; + + /// With multiple GPUs we allocate arrays to be pushed to the vectors and + /// copy data on each GPU then when we gather data to GPU 0 we can copy back + /// to the original indexing + lwe_array_in_vec = base_lut_object->lwe_array_in_vec; + lwe_after_ks_vec = base_lut_object->lwe_after_ks_vec; + lwe_after_pbs_vec = base_lut_object->lwe_after_pbs_vec; + lwe_trivial_indexes_vec = base_lut_object->lwe_trivial_indexes_vec; mem_reuse = true; @@ -630,22 +666,24 @@ template struct int_radix_lut { num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); lwe_trivial_indexes = (Torus *)cuda_malloc_async( num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - auto h_lwe_indexes = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + + h_lwe_indexes_in = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); + h_lwe_indexes_out = (Torus *)malloc(num_radix_blocks * sizeof(Torus)); for (int i = 0; i < num_radix_blocks; i++) - h_lwe_indexes[i] = i; + h_lwe_indexes_in[i] = i; - cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes, + cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), 
streams[0], gpu_indexes[0]); - cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes, + cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes, + cuda_memcpy_async_to_gpu(lwe_trivial_indexes, h_lwe_indexes_in, num_radix_blocks * sizeof(Torus), streams[0], gpu_indexes[0]); - cuda_synchronize_stream(streams[0], gpu_indexes[0]); - free(h_lwe_indexes); + memcpy(h_lwe_indexes_out, h_lwe_indexes_in, + num_radix_blocks * sizeof(Torus)); } // Return a pointer to idx-ith lut at gpu_index's global memory @@ -663,6 +701,22 @@ template struct int_radix_lut { return &lut_indexes[ind]; } + // If this function is called we assume the lwe_indexes_(in/out) are not the + // trivial anymore and thus we disable optimizations + void set_lwe_indexes(cudaStream_t stream, uint32_t gpu_index, + Torus *h_indexes_in, Torus *h_indexes_out) { + + memcpy(h_lwe_indexes_in, h_indexes_in, num_blocks * sizeof(Torus)); + memcpy(h_lwe_indexes_out, h_indexes_out, num_blocks * sizeof(Torus)); + + cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_indexes_in, + num_blocks * sizeof(Torus), stream, gpu_index); + cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_indexes_out, + num_blocks * sizeof(Torus), stream, gpu_index); + + using_trivial_lwe_indexes = false; + } + // Broadcast luts from gpu src_gpu_idx to all active gpus void broadcast_lut(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t src_gpu_idx) { @@ -672,7 +726,6 @@ template struct int_radix_lut { auto src_lut_indexes = lut_indexes_vec[src_gpu_idx]; cuda_synchronize_stream(streams[0], gpu_indexes[0]); -#pragma omp parallel for num_threads(active_gpu_count) for (uint i = 0; i < active_gpu_count; i++) { if (i != src_gpu_idx) { auto dst_lut = lut_vec[i]; @@ -690,7 +743,6 @@ template struct int_radix_lut { void release(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { -#pragma omp parallel for num_threads(active_gpu_count) for (uint i = 0; i < active_gpu_count; i++) { cuda_drop_async(lut_vec[i], streams[i], gpu_indexes[i]); cuda_drop_async(lut_indexes_vec[i], streams[i], gpu_indexes[i]); @@ -701,9 +753,13 @@ template struct int_radix_lut { cuda_drop_async(lwe_indexes_in, streams[0], gpu_indexes[0]); cuda_drop_async(lwe_indexes_out, streams[0], gpu_indexes[0]); cuda_drop_async(lwe_trivial_indexes, streams[0], gpu_indexes[0]); + + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + free(h_lwe_indexes_in); + free(h_lwe_indexes_out); + if (!mem_reuse) { cuda_drop_async(tmp_lwe_before_ks, streams[0], gpu_indexes[0]); - cuda_drop_async(tmp_lwe_after_ks, streams[0], gpu_indexes[0]); cuda_synchronize_stream(streams[0], gpu_indexes[0]); for (int i = 0; i < buffer.size(); i++) { switch (params.pbs_type) { @@ -721,6 +777,17 @@ template struct int_radix_lut { cuda_synchronize_stream(streams[i], gpu_indexes[i]); } buffer.clear(); + + multi_gpu_release_async(streams, gpu_indexes, lwe_array_in_vec); + multi_gpu_release_async(streams, gpu_indexes, lwe_after_ks_vec); + multi_gpu_release_async(streams, gpu_indexes, lwe_after_pbs_vec); + multi_gpu_release_async(streams, gpu_indexes, lwe_trivial_indexes_vec); + for (uint i = 0; i < active_gpu_count; i++) + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + lwe_array_in_vec.clear(); + lwe_after_ks_vec.clear(); + lwe_after_pbs_vec.clear(); + lwe_trivial_indexes_vec.clear(); } } }; @@ -782,10 +849,6 @@ template struct int_bit_extract_luts_buffer { for (int i = 0; i < bits_per_block; i++) 
h_lwe_indexes_in[i + j * bits_per_block] = j; } - cuda_memcpy_async_to_gpu(lut->lwe_indexes_in, h_lwe_indexes_in, - num_radix_blocks * bits_per_block * - sizeof(Torus), - streams[0], gpu_indexes[0]); /** * the output should aim different lwe ciphertexts, so lwe_indexes_out = @@ -797,10 +860,9 @@ template struct int_bit_extract_luts_buffer { for (int i = 0; i < num_radix_blocks * bits_per_block; i++) h_lwe_indexes_out[i] = i; - cuda_memcpy_async_to_gpu(lut->lwe_indexes_out, h_lwe_indexes_out, - num_radix_blocks * bits_per_block * - sizeof(Torus), - streams[0], gpu_indexes[0]); + lut->set_lwe_indexes(streams[0], gpu_indexes[0], h_lwe_indexes_in, + h_lwe_indexes_out); + cuda_synchronize_stream(streams[0], gpu_indexes[0]); free(h_lut_indexes); free(h_lwe_indexes_in); @@ -1666,6 +1728,7 @@ template struct int_arithmetic_scalar_shift_buffer { cudaStream_t *local_streams_1; cudaStream_t *local_streams_2; + uint32_t active_gpu_count; int_arithmetic_scalar_shift_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, @@ -1673,12 +1736,15 @@ template struct int_arithmetic_scalar_shift_buffer { int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { + active_gpu_count = get_active_gpu_count(1, gpu_count); // In the arithmetic shift, a PBS has to be applied to the last rotated // block twice: once to shift it, once to compute the padding block to be // copied onto all blocks to the left of the last rotated block - local_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - local_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - for (uint j = 0; j < gpu_count; j++) { + local_streams_1 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + local_streams_2 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { local_streams_1[j] = cuda_create_stream(gpu_indexes[j]); local_streams_2[j] = cuda_create_stream(gpu_indexes[j]); } @@ -1811,7 +1877,7 @@ template struct int_arithmetic_scalar_shift_buffer { void release(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < active_gpu_count; j++) { cuda_destroy_stream(local_streams_1[j], gpu_indexes[j]); cuda_destroy_stream(local_streams_2[j], gpu_indexes[j]); } @@ -1840,20 +1906,24 @@ template struct int_zero_out_if_buffer { cudaStream_t *true_streams; cudaStream_t *false_streams; + uint32_t active_gpu_count; int_zero_out_if_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_radix_blocks, bool allocate_gpu_memory) { this->params = params; + active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); Torus big_size = (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); if (allocate_gpu_memory) { tmp = (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]); // We may use a different stream to allow concurrent operation - true_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - false_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - for (uint j = 0; j < gpu_count; j++) { + true_streams = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + false_streams = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { true_streams[j] = cuda_create_stream(gpu_indexes[j]); false_streams[j] = cuda_create_stream(gpu_indexes[j]); } @@ -1862,7 
+1932,7 @@ template struct int_zero_out_if_buffer { void release(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count) { cuda_drop_async(tmp, streams[0], gpu_indexes[0]); - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < active_gpu_count; j++) { cuda_destroy_stream(true_streams[j], gpu_indexes[j]); cuda_destroy_stream(false_streams[j], gpu_indexes[j]); } @@ -2295,6 +2365,7 @@ template struct int_comparison_buffer { int_radix_lut *signed_msb_lut; cudaStream_t *lsb_streams; cudaStream_t *msb_streams; + uint32_t active_gpu_count; int_comparison_buffer(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, COMPARISON_TYPE op, @@ -2304,14 +2375,18 @@ template struct int_comparison_buffer { this->op = op; this->is_signed = is_signed; + active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); + identity_lut_f = [](Torus x) -> Torus { return x; }; auto big_lwe_size = params.big_lwe_dimension + 1; if (allocate_gpu_memory) { - lsb_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - msb_streams = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - for (uint j = 0; j < gpu_count; j++) { + lsb_streams = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + msb_streams = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { lsb_streams[j] = cuda_create_stream(gpu_indexes[j]); msb_streams[j] = cuda_create_stream(gpu_indexes[j]); } @@ -2475,7 +2550,7 @@ template struct int_comparison_buffer { signed_msb_lut->release(streams, gpu_indexes, gpu_count); delete (signed_msb_lut); } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < active_gpu_count; j++) { cuda_destroy_stream(lsb_streams[j], gpu_indexes[j]); cuda_destroy_stream(msb_streams[j], gpu_indexes[j]); } @@ -2486,6 +2561,7 @@ template struct int_comparison_buffer { template struct int_div_rem_memory { int_radix_params params; + uint32_t active_gpu_count; // memory objects for other operations int_logical_scalar_shift_buffer *shift_mem_1; @@ -2721,6 +2797,8 @@ template struct int_div_rem_memory { int_div_rem_memory(cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, int_radix_params params, uint32_t num_blocks, bool allocate_gpu_memory) { + active_gpu_count = get_active_gpu_count(num_blocks, gpu_count); + this->params = params; shift_mem_1 = new int_logical_scalar_shift_buffer( streams, gpu_indexes, gpu_count, SHIFT_OR_ROTATE_TYPE::LEFT_SHIFT, @@ -2740,11 +2818,15 @@ template struct int_div_rem_memory { init_lookup_tables(streams, gpu_indexes, gpu_count, num_blocks); init_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks); - sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - sub_streams_3 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - sub_streams_4 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - for (uint j = 0; j < gpu_count; j++) { + sub_streams_1 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_2 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_3 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_4 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); sub_streams_3[j] 
= cuda_create_stream(gpu_indexes[j]); @@ -2815,7 +2897,7 @@ template struct int_div_rem_memory { delete[] merge_overflow_flags_luts; // release sub streams - for (uint i = 0; i < gpu_count; i++) { + for (uint i = 0; i < active_gpu_count; i++) { cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); cuda_destroy_stream(sub_streams_3[i], gpu_indexes[i]); @@ -2992,12 +3074,12 @@ template struct int_resolve_signed_overflow_memory { template struct int_signed_overflowing_add_or_sub_memory { int_radix_params params; + uint32_t active_gpu_count; // memory objects for other operations int_sc_prop_memory *scp_mem; int_last_block_inner_propagate_memory *las_block_prop_mem; int_resolve_signed_overflow_memory *resolve_overflow_mem; - // lookupt tables // sub streams cudaStream_t *sub_streams_1; @@ -3036,13 +3118,17 @@ template struct int_signed_overflowing_add_or_sub_memory { int_radix_params params, uint32_t num_blocks, SIGNED_OPERATION op, bool allocate_gpu_memory) { this->params = params; + active_gpu_count = get_active_gpu_count(num_blocks, gpu_count); - allocate_temporary_buffers(streams, gpu_indexes, gpu_count, num_blocks); + allocate_temporary_buffers(streams, gpu_indexes, active_gpu_count, + num_blocks); // initialize streams - sub_streams_1 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - sub_streams_2 = (cudaStream_t *)malloc(gpu_count * sizeof(cudaStream_t)); - for (uint j = 0; j < gpu_count; j++) { + sub_streams_1 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + sub_streams_2 = + (cudaStream_t *)malloc(active_gpu_count * sizeof(cudaStream_t)); + for (uint j = 0; j < active_gpu_count; j++) { sub_streams_1[j] = cuda_create_stream(gpu_indexes[j]); sub_streams_2[j] = cuda_create_stream(gpu_indexes[j]); } @@ -3078,7 +3164,7 @@ template struct int_signed_overflowing_add_or_sub_memory { cuda_drop_async(last_block_inner_propagation, streams[0], gpu_indexes[0]); // sub streams - for (uint i = 0; i < gpu_count; i++) { + for (uint i = 0; i < active_gpu_count; i++) { cuda_destroy_stream(sub_streams_1[i], gpu_indexes[i]); cuda_destroy_stream(sub_streams_2[i], gpu_indexes[i]); } diff --git a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h index 70297f50f..924ec131f 100644 --- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h +++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h @@ -9,15 +9,13 @@ void cuda_keyswitch_lwe_ciphertext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t gpu_offset = 0); + uint32_t base_log, uint32_t level_count, uint32_t num_samples); void cuda_keyswitch_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t gpu_offset = 0); + uint32_t base_log, uint32_t level_count, uint32_t num_samples); } #endif // CNCRT_KS_H_ diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h index 260c037b6..0980ddee4 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h +++ 
b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h @@ -51,7 +51,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset = 0); + uint32_t max_shared_memory); void cleanup_cuda_programmable_bootstrap_amortized(void *stream, uint32_t gpu_index, @@ -76,7 +76,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset = 0); + uint32_t max_shared_memory); void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -85,7 +85,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset = 0); + uint32_t max_shared_memory); void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, int8_t **pbs_buffer); @@ -354,7 +354,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0); + uint32_t lwe_idx, uint32_t max_shared_memory); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -364,7 +364,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0); + uint32_t lwe_idx, uint32_t max_shared_memory); #if (CUDA_ARCH >= 900) template @@ -375,7 +375,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset = 0); + uint32_t lwe_idx, uint32_t max_shared_memory); template void scratch_cuda_programmable_bootstrap_tbc( diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h index f0f9efabd..a80035173 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h +++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h @@ -29,8 +29,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset, - uint32_t lwe_chunk_size = 0); + uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0); 
void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, uint32_t gpu_index, @@ -65,7 +64,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); #endif template @@ -92,7 +91,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size = 0); + uint32_t lwe_chunk_size = 0); template void scratch_cuda_multi_bit_programmable_bootstrap( @@ -111,7 +110,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size = 0); + uint32_t lwe_chunk_size = 0); template __host__ __device__ uint64_t diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu index 2373e418a..524a1fa45 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu @@ -9,16 +9,14 @@ void cuda_keyswitch_lwe_ciphertext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t gpu_offset) { + uint32_t base_log, uint32_t level_count, uint32_t num_samples) { cuda_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(ksk), - lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples, - gpu_offset); + lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples); } /* Perform keyswitch on a batch of 64 bits input LWE ciphertexts. 
@@ -41,14 +39,12 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t gpu_offset) { + uint32_t base_log, uint32_t level_count, uint32_t num_samples) { cuda_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(ksk), - lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples, - gpu_offset); + lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples); } diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh index a1bc861f7..db78104a9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh @@ -43,23 +43,20 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, const Torus *__restrict__ lwe_array_in, const Torus *__restrict__ lwe_input_indexes, const Torus *__restrict__ ksk, uint32_t lwe_dimension_in, - uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, - int gpu_offset) { + uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count) { const int tid = threadIdx.x + blockIdx.x * blockDim.x; const int shmem_index = threadIdx.x + threadIdx.y * blockDim.x; extern __shared__ int8_t sharedmem[]; Torus *lwe_acc_out = (Torus *)sharedmem; - auto block_lwe_array_out = - get_chunk(lwe_array_out, lwe_output_indexes[blockIdx.y + gpu_offset], - lwe_dimension_out + 1); + auto block_lwe_array_out = get_chunk( + lwe_array_out, lwe_output_indexes[blockIdx.y], lwe_dimension_out + 1); if (tid <= lwe_dimension_out) { Torus local_lwe_out = 0; - auto block_lwe_array_in = - get_chunk(lwe_array_in, lwe_input_indexes[blockIdx.y + gpu_offset], - lwe_dimension_in + 1); + auto block_lwe_array_in = get_chunk( + lwe_array_in, lwe_input_indexes[blockIdx.y], lwe_dimension_in + 1); if (tid == lwe_dimension_out && threadIdx.y == 0) { local_lwe_out = block_lwe_array_in[lwe_dimension_in]; @@ -105,8 +102,7 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t gpu_offset = 0) { + uint32_t base_log, uint32_t level_count, uint32_t num_samples) { cudaSetDevice(gpu_index); @@ -122,42 +118,40 @@ __host__ void cuda_keyswitch_lwe_ciphertext_vector( keyswitch<<>>( lwe_array_out, lwe_output_indexes, lwe_array_in, lwe_input_indexes, ksk, - lwe_dimension_in, lwe_dimension_out, base_log, level_count, gpu_offset); + lwe_dimension_in, lwe_dimension_out, base_log, level_count); check_cuda_error(cudaGetLastError()); } template -void execute_keyswitch(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lwe_array_in, - Torus *lwe_input_indexes, Torus **ksks, - uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, - uint32_t base_log, uint32_t level_count, - uint32_t num_samples, bool sync_streams = true) { +void execute_keyswitch_async(cudaStream_t *streams, uint32_t 
*gpu_indexes, + uint32_t gpu_count, + const LweArrayVariant &lwe_array_out, + const LweArrayVariant &lwe_output_indexes, + const LweArrayVariant &lwe_array_in, + const LweArrayVariant &lwe_input_indexes, + Torus **ksks, uint32_t lwe_dimension_in, + uint32_t lwe_dimension_out, uint32_t base_log, + uint32_t level_count, uint32_t num_samples) { /// If the number of radix blocks is lower than the number of GPUs, not all /// GPUs will be active and there will be 1 input per GPU - auto active_gpu_count = get_active_gpu_count(num_samples, gpu_count); - int num_samples_on_gpu_0 = get_num_inputs_on_gpu(num_samples, 0, gpu_count); - if (sync_streams) - cuda_synchronize_stream(streams[0], gpu_indexes[0]); -#pragma omp parallel for num_threads(active_gpu_count) - for (uint i = 0; i < active_gpu_count; i++) { + for (uint i = 0; i < gpu_count; i++) { int num_samples_on_gpu = get_num_inputs_on_gpu(num_samples, i, gpu_count); - int gpu_offset = get_gpu_offset(num_samples, i, gpu_count); + + Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i); + Torus *current_lwe_output_indexes = + GET_VARIANT_ELEMENT(lwe_output_indexes, i); + Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i); + Torus *current_lwe_input_indexes = + GET_VARIANT_ELEMENT(lwe_input_indexes, i); // Compute Keyswitch cuda_keyswitch_lwe_ciphertext_vector( - streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes, - lwe_array_in, lwe_input_indexes, ksks[i], lwe_dimension_in, - lwe_dimension_out, base_log, level_count, num_samples_on_gpu, - gpu_offset); + streams[i], gpu_indexes[i], current_lwe_array_out, + current_lwe_output_indexes, current_lwe_array_in, + current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out, + base_log, level_count, num_samples_on_gpu); } - - if (sync_streams) - for (uint i = 0; i < active_gpu_count; i++) { - cuda_synchronize_stream(streams[i], gpu_indexes[i]); - } } #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh index bab675bf5..28b02a57f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh @@ -131,7 +131,7 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb( } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh index 43fe3b184..a1057a443 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh @@ -76,8 +76,10 @@ __host__ void host_integer_radix_cmux_kb( mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) { cuda_synchronize_stream(true_streams[j], gpu_indexes[j]); + } + for (uint j = 0; j < mem_ptr->zero_if_false_buffer->active_gpu_count; j++) { cuda_synchronize_stream(false_streams[j], gpu_indexes[j]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh index e1d3e8452..d56d6e329 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh @@ 
-245,7 +245,6 @@ __host__ void host_compare_with_zero_equality( int_comparison_buffer *mem_ptr, void **bsks, Torus **ksks, int32_t num_radix_blocks, int_radix_lut *zero_comparison) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; auto message_modulus = params.message_modulus; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh index 9562fdbf1..f38c59167 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh @@ -31,17 +31,13 @@ template struct lwe_ciphertext_list { int_radix_params params; size_t big_lwe_size; - size_t radix_size; size_t big_lwe_size_bytes; - size_t radix_size_bytes; size_t big_lwe_dimension; lwe_ciphertext_list(Torus *src, int_radix_params params, size_t max_blocks) : data(src), params(params), max_blocks(max_blocks) { big_lwe_size = params.big_lwe_dimension + 1; big_lwe_size_bytes = big_lwe_size * sizeof(Torus); - radix_size = max_blocks * big_lwe_size; - radix_size_bytes = radix_size * sizeof(Torus); big_lwe_dimension = params.big_lwe_dimension; len = max_blocks; } @@ -404,7 +400,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, gpu_count); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]); @@ -514,7 +510,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, gpu_indexes, gpu_count); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]); @@ -591,7 +587,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_3[j], gpu_indexes[j]); @@ -632,7 +628,7 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes, bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 7e216ccff..b0e8d4e7d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -3,6 +3,7 @@ #include "crypto/keyswitch.cuh" #include "device.h" +#include "helper_multi_gpu.h" #include "integer.h" #include "integer/scalar_addition.cuh" #include "linear_algebra.h" @@ -10,6 +11,7 @@ #include "polynomial/functions.cuh" #include "programmable_bootstrap.h" #include "utils/helper.cuh" +#include "utils/helper_multi_gpu.cuh" #include 
"utils/kernel_dimensions.cuh" #include @@ -153,28 +155,69 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( auto polynomial_size = params.polynomial_size; auto grouping_factor = params.grouping_factor; - cuda_synchronize_stream(streams[0], gpu_indexes[0]); - /// Apply KS to go from a big LWE dimension to a small LWE dimension - execute_keyswitch(streams, gpu_indexes, gpu_count, - lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, - lwe_array_in, lut->lwe_indexes_in, ksks, - big_lwe_dimension, small_lwe_dimension, ks_base_log, - ks_level, num_radix_blocks, false); + /// For multi GPU execution we create vectors of pointers for inputs and + /// outputs + std::vector lwe_array_in_vec = lut->lwe_array_in_vec; + std::vector lwe_after_ks_vec = lut->lwe_after_ks_vec; + std::vector lwe_after_pbs_vec = lut->lwe_after_pbs_vec; + std::vector lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec; - /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE - /// dimension to a big LWE dimension - execute_pbs( - streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out, - lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks, - lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension, - small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, 1, 0, - cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false); - - /// Synchronize all GPUs auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); - for (uint i = 0; i < active_gpu_count; i++) { - cuda_synchronize_stream(streams[i], gpu_indexes[i]); + if (active_gpu_count == 1) { + execute_keyswitch_async(streams, gpu_indexes, 1, lwe_after_ks_vec[0], + lwe_trivial_indexes_vec[0], lwe_array_in, + lut->lwe_indexes_in, ksks, big_lwe_dimension, + small_lwe_dimension, ks_base_log, ks_level, + num_radix_blocks); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + execute_pbs_async( + streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out, + lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], + lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, + small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, + grouping_factor, num_radix_blocks, 1, 0, + cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + } else { + /// Make sure all data that should be on GPU 0 is indeed there + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + + /// With multiple GPUs we push to the vectors on each GPU then when we + /// gather data to GPU 0 we can copy back to the original indexing + multi_gpu_scatter_lwe_async( + streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, lwe_array_in, + lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, num_radix_blocks, + big_lwe_dimension + 1); + + /// Apply KS to go from a big LWE dimension to a small LWE dimension + execute_keyswitch_async(streams, gpu_indexes, active_gpu_count, + lwe_after_ks_vec, lwe_trivial_indexes_vec, + lwe_array_in_vec, lwe_trivial_indexes_vec, + ksks, big_lwe_dimension, small_lwe_dimension, + ks_base_log, ks_level, num_radix_blocks); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + execute_pbs_async( + streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec, + lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec, + lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, + glwe_dimension, small_lwe_dimension, polynomial_size, 
pbs_base_log, + pbs_level, grouping_factor, num_radix_blocks, 1, 0, + cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + + /// Copy data back to GPU 0 and release vecs + multi_gpu_gather_lwe_async(streams, gpu_indexes, active_gpu_count, + lwe_array_out, lwe_after_pbs_vec, + lut->h_lwe_indexes_out, + lut->using_trivial_lwe_indexes, + num_radix_blocks, big_lwe_dimension + 1); + + /// Synchronize all GPUs + for (uint i = 0; i < active_gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } } } @@ -205,29 +248,65 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( num_radix_blocks); check_cuda_error(cudaGetLastError()); - cuda_synchronize_stream(streams[0], gpu_indexes[0]); + /// For multi GPU execution we create vectors of pointers for inputs and + /// outputs + std::vector lwe_array_in_vec = lut->lwe_array_in_vec; + std::vector lwe_after_ks_vec = lut->lwe_after_ks_vec; + std::vector lwe_after_pbs_vec = lut->lwe_after_pbs_vec; + std::vector lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec; - /// Apply KS to go from a big LWE dimension to a small LWE dimension - execute_keyswitch(streams, gpu_indexes, gpu_count, - lut->tmp_lwe_after_ks, lut->lwe_trivial_indexes, - lwe_array_pbs_in, lut->lwe_indexes_in, ksks, - big_lwe_dimension, small_lwe_dimension, ks_base_log, - ks_level, num_radix_blocks, false); - - /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE - /// dimension to a big LWE dimension - execute_pbs( - streams, gpu_indexes, gpu_count, lwe_array_out, lut->lwe_indexes_out, - lut->lut_vec, lut->lut_indexes_vec, lut->tmp_lwe_after_ks, - lut->lwe_trivial_indexes, bsks, lut->buffer, glwe_dimension, - small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, 1, 0, - cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type, false); - - /// Synchronize all GPUs auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count); - for (uint i = 0; i < active_gpu_count; i++) { - cuda_synchronize_stream(streams[i], gpu_indexes[i]); + if (active_gpu_count == 1) { + execute_keyswitch_async(streams, gpu_indexes, 1, lwe_after_ks_vec[0], + lwe_trivial_indexes_vec[0], lwe_array_pbs_in, + lut->lwe_indexes_in, ksks, big_lwe_dimension, + small_lwe_dimension, ks_base_log, ks_level, + num_radix_blocks); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + execute_pbs_async( + streams, gpu_indexes, 1, lwe_array_out, lut->lwe_indexes_out, + lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], + lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, + small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, + grouping_factor, num_radix_blocks, 1, 0, + cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + } else { + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + multi_gpu_scatter_lwe_async( + streams, gpu_indexes, active_gpu_count, lwe_array_in_vec, + lwe_array_pbs_in, lut->h_lwe_indexes_in, lut->using_trivial_lwe_indexes, + num_radix_blocks, big_lwe_dimension + 1); + + /// Apply KS to go from a big LWE dimension to a small LWE dimension + execute_keyswitch_async(streams, gpu_indexes, active_gpu_count, + lwe_after_ks_vec, lwe_trivial_indexes_vec, + lwe_array_in_vec, lwe_trivial_indexes_vec, + ksks, big_lwe_dimension, small_lwe_dimension, + ks_base_log, ks_level, num_radix_blocks); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + execute_pbs_async( + 
streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec, + lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec, + lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, + glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, + pbs_level, grouping_factor, num_radix_blocks, 1, 0, + cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + + /// Copy data back to GPU 0 and release vecs + multi_gpu_gather_lwe_async(streams, gpu_indexes, active_gpu_count, + lwe_array_out, lwe_after_pbs_vec, + lut->h_lwe_indexes_out, + lut->using_trivial_lwe_indexes, + num_radix_blocks, big_lwe_dimension + 1); + + /// Synchronize all GPUs + for (uint i = 0; i < active_gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } } } @@ -325,7 +404,6 @@ void generate_device_accumulator_bivariate( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus, std::function f) { - cudaSetDevice(gpu_index); // host lut Torus *h_lut = (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus)); @@ -357,7 +435,6 @@ void generate_device_accumulator_bivariate_with_factor( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus, std::function f, int factor) { - cudaSetDevice(gpu_index); // host lut Torus *h_lut = (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus)); @@ -595,10 +672,10 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, cudaSetDevice(gpu_indexes[0]); /// Since the keyswitch is done on one input only, use only 1 GPU - cuda_keyswitch_lwe_ciphertext_vector( - streams[0], gpu_indexes[0], mem_ptr->tmp_small_lwe_vector, + execute_keyswitch_async( + streams, gpu_indexes, 1, mem_ptr->tmp_small_lwe_vector, mem_ptr->lut->lwe_trivial_indexes, cur_input_block, - mem_ptr->lut->lwe_trivial_indexes, ksks[0], params.big_lwe_dimension, + mem_ptr->lut->lwe_trivial_indexes, ksks, params.big_lwe_dimension, params.small_lwe_dimension, params.ks_base_log, params.ks_level, 1); cuda_memcpy_async_gpu_to_gpu(&mem_ptr->tmp_small_lwe_vector[small_lwe_size], @@ -606,7 +683,7 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, small_lwe_size * sizeof(Torus), streams[0], gpu_indexes[0]); - execute_pbs( + execute_pbs_async( streams, gpu_indexes, 1, mem_ptr->tmp_big_lwe_vector, mem_ptr->lut->lwe_trivial_indexes, mem_ptr->lut->lut_vec, mem_ptr->lut->lut_indexes_vec, mem_ptr->tmp_small_lwe_vector, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 39465f430..6aa721b08 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -8,11 +8,13 @@ #include "crypto/keyswitch.cuh" #include "device.h" +#include "helper_multi_gpu.h" #include "integer.h" #include "integer/integer.cuh" #include "linear_algebra.h" #include "programmable_bootstrap.h" #include "utils/helper.cuh" +#include "utils/helper_multi_gpu.cuh" #include "utils/kernel_dimensions.cuh" #include #include @@ -224,11 +226,12 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( auto message_modulus = mem_ptr->params.message_modulus; auto carry_modulus = mem_ptr->params.carry_modulus; auto num_blocks = num_blocks_in_radix; - auto big_lwe_size = mem_ptr->params.big_lwe_dimension + 1; + auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension; + auto big_lwe_size = big_lwe_dimension + 1; auto 
glwe_dimension = mem_ptr->params.glwe_dimension; auto polynomial_size = mem_ptr->params.polynomial_size; - auto lwe_dimension = mem_ptr->params.small_lwe_dimension; - auto big_lwe_dimension = mem_ptr->params.big_lwe_dimension; + auto small_lwe_dimension = mem_ptr->params.small_lwe_dimension; + auto small_lwe_size = small_lwe_dimension + 1; if (old_blocks != terms) { cuda_memcpy_async_gpu_to_gpu(old_blocks, terms, @@ -313,12 +316,10 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( auto lwe_indexes_in = luts_message_carry->lwe_indexes_in; auto lwe_indexes_out = luts_message_carry->lwe_indexes_out; + luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0], + h_lwe_idx_in, h_lwe_idx_out); size_t copy_size = total_count * sizeof(Torus); - cuda_memcpy_async_to_gpu(lwe_indexes_in, h_lwe_idx_in, copy_size, - streams[0], gpu_indexes[0]); - cuda_memcpy_async_to_gpu(lwe_indexes_out, h_lwe_idx_out, copy_size, - streams[0], gpu_indexes[0]); copy_size = sm_copy_count * sizeof(int32_t); cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size, streams[0], gpu_indexes[0]); @@ -338,27 +339,97 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( luts_message_carry->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); - auto active_gpu_count = get_active_gpu_count(total_count, gpu_count); - /// Apply KS to go from a big LWE dimension to a small LWE dimension - /// After this keyswitch execution, we need to synchronize the streams - /// because the keyswitch and PBS do not operate on the same number of - /// inputs - execute_keyswitch(streams, gpu_indexes, gpu_count, small_lwe_vector, - lwe_indexes_in, new_blocks, lwe_indexes_in, ksks, - polynomial_size * glwe_dimension, lwe_dimension, - mem_ptr->params.ks_base_log, - mem_ptr->params.ks_level, message_count, true); + /// For multi GPU execution we create vectors of pointers for inputs and + /// outputs + std::vector new_blocks_vec = luts_message_carry->lwe_array_in_vec; + std::vector small_lwe_vector_vec = + luts_message_carry->lwe_after_ks_vec; + std::vector lwe_after_pbs_vec = + luts_message_carry->lwe_after_pbs_vec; + std::vector lwe_trivial_indexes_vec = + luts_message_carry->lwe_trivial_indexes_vec; - /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE - /// dimension to a big LWE dimension - execute_pbs(streams, gpu_indexes, gpu_count, new_blocks, - lwe_indexes_out, luts_message_carry->lut_vec, - luts_message_carry->lut_indexes_vec, small_lwe_vector, - lwe_indexes_in, bsks, luts_message_carry->buffer, - glwe_dimension, lwe_dimension, polynomial_size, - mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, - mem_ptr->params.grouping_factor, total_count, 2, 0, - max_shared_memory, mem_ptr->params.pbs_type, true); + auto active_gpu_count = get_active_gpu_count(total_count, gpu_count); + if (active_gpu_count == 1) { + /// Apply KS to go from a big LWE dimension to a small LWE dimension + /// After this keyswitch execution, we need to synchronize the streams + /// because the keyswitch and PBS do not operate on the same number of + /// inputs + execute_keyswitch_async( + streams, gpu_indexes, 1, small_lwe_vector, lwe_indexes_in, new_blocks, + lwe_indexes_in, ksks, polynomial_size * glwe_dimension, + small_lwe_dimension, mem_ptr->params.ks_base_log, + mem_ptr->params.ks_level, message_count); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + execute_pbs_async( + streams, gpu_indexes, 1, new_blocks, lwe_indexes_out, + luts_message_carry->lut_vec, 
luts_message_carry->lut_indexes_vec, + small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer, + glwe_dimension, small_lwe_dimension, polynomial_size, + mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, + mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory, + mem_ptr->params.pbs_type); + } else { + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + + multi_gpu_scatter_lwe_async( + streams, gpu_indexes, active_gpu_count, new_blocks_vec, new_blocks, + luts_message_carry->h_lwe_indexes_in, + luts_message_carry->using_trivial_lwe_indexes, message_count, + big_lwe_size); + + /// Apply KS to go from a big LWE dimension to a small LWE dimension + /// After this keyswitch execution, we need to synchronize the streams + /// because the keyswitch and PBS do not operate on the same number of + /// inputs + execute_keyswitch_async( + streams, gpu_indexes, active_gpu_count, small_lwe_vector_vec, + lwe_trivial_indexes_vec, new_blocks_vec, lwe_trivial_indexes_vec, + ksks, big_lwe_dimension, small_lwe_dimension, + mem_ptr->params.ks_base_log, mem_ptr->params.ks_level, total_count); + + /// Copy data back to GPU 0, rebuild the lwe array, and scatter again on a + /// different configuration + multi_gpu_gather_lwe_async( + streams, gpu_indexes, gpu_count, small_lwe_vector, + small_lwe_vector_vec, luts_message_carry->h_lwe_indexes_in, + luts_message_carry->using_trivial_lwe_indexes, message_count, + small_lwe_size); + /// Synchronize all GPUs + for (uint i = 0; i < active_gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } + + multi_gpu_scatter_lwe_async( + streams, gpu_indexes, gpu_count, small_lwe_vector_vec, + small_lwe_vector, luts_message_carry->h_lwe_indexes_in, + luts_message_carry->using_trivial_lwe_indexes, total_count, + small_lwe_size); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + execute_pbs_async( + streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec, + lwe_trivial_indexes_vec, luts_message_carry->lut_vec, + luts_message_carry->lut_indexes_vec, small_lwe_vector_vec, + lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer, + glwe_dimension, small_lwe_dimension, polynomial_size, + mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, + mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory, + mem_ptr->params.pbs_type); + + multi_gpu_gather_lwe_async( + streams, gpu_indexes, active_gpu_count, new_blocks, lwe_after_pbs_vec, + luts_message_carry->h_lwe_indexes_out, + luts_message_carry->using_trivial_lwe_indexes, total_count, + big_lwe_size); + /// Synchronize all GPUs + for (uint i = 0; i < active_gpu_count; i++) { + cuda_synchronize_stream(streams[i], gpu_indexes[i]); + } + } int rem_blocks = (r > chunk_size) ? 
r % chunk_size * num_blocks : 0; int new_blocks_created = 2 * ch_amount * num_blocks; diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh index 3aba53cce..a8b688239 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh @@ -133,7 +133,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb( mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]); cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]); } @@ -205,7 +205,6 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( std::function sign_handler_f, void **bsks, Torus **ksks, uint32_t total_num_radix_blocks, uint32_t total_num_scalar_blocks) { - cudaSetDevice(gpu_indexes[0]); auto params = mem_ptr->params; auto big_lwe_dimension = params.big_lwe_dimension; auto glwe_dimension = params.glwe_dimension; @@ -397,7 +396,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( signed_msb_lut->params.message_modulus); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]); cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]); } @@ -465,7 +464,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb( mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus); } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]); cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]); } @@ -737,7 +736,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb( } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem_ptr->active_gpu_count; j++) { cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]); cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh index cfc9a6773..e612c9ab2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh @@ -245,7 +245,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace( } } } - for (uint j = 0; j < gpu_count; j++) { + for (uint j = 0; j < mem->active_gpu_count; j++) { cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]); cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]); } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh index 4b5a2f1f3..cc5bd54fa 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh @@ -1,14 +1,13 @@ #ifndef CUDA_PROGRAMMABLE_BOOTSTRAP_CUH #define CUDA_PROGRAMMABLE_BOOTSTRAP_CUH +#include "cooperative_groups.h" #include "device.h" #include "fft/bnsmfft.cuh" +#include "helper_multi_gpu.h" #include "programmable_bootstrap.h" #include "programmable_bootstrap_multibit.h" -#include "cooperative_groups.h" -#include "helper_multi_gpu.h" - using namespace cooperative_groups; namespace cg = cooperative_groups; @@ -118,22 
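The synchronization loops above now run over mem_ptr->active_gpu_count rather than gpu_count, so streams on devices that received no work for this batch are left alone. A minimal sketch of that idea, assuming (not shown in this patch) that the active count is capped by the number of inputs; it uses the plain CUDA runtime API instead of the project's cuda_synchronize_stream wrapper, and active_gpu_count_sketch / sync_active_streams are hypothetical names.

#include <algorithm>
#include <cstdint>
#include <cuda_runtime.h>

// Assumption: a device is "active" only if it receives at least one input,
// so the active count never exceeds the input count.
static uint32_t active_gpu_count_sketch(uint32_t num_inputs,
                                        uint32_t gpu_count) {
  return std::min(num_inputs, gpu_count);
}

// Synchronize only the streams that actually had kernels enqueued; looping up
// to gpu_count would needlessly switch to and wait on idle devices.
static void sync_active_streams(cudaStream_t *streams, const int *gpu_indexes,
                                uint32_t active_gpu_count) {
  for (uint32_t j = 0; j < active_gpu_count; ++j) {
    cudaSetDevice(gpu_indexes[j]);
    cudaStreamSynchronize(streams[j]);
  }
}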
+117,18 @@ mul_ggsw_glwe(Torus *accumulator, double2 *fft, double2 *join_buffer, } template -void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes, - uint32_t gpu_count, Torus *lwe_array_out, - Torus *lwe_output_indexes, std::vector lut_vec, - std::vector lut_indexes_vec, Torus *lwe_array_in, - Torus *lwe_input_indexes, void **bootstrapping_keys, - std::vector pbs_buffer, uint32_t glwe_dimension, - uint32_t lwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, - uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, PBS_TYPE pbs_type, - bool sync_streams = true) { - auto active_gpu_count = - get_active_gpu_count(input_lwe_ciphertext_count, gpu_count); - if (sync_streams) - cuda_synchronize_stream(streams[0], gpu_indexes[0]); +void execute_pbs_async( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + const LweArrayVariant &lwe_array_out, + const LweArrayVariant &lwe_output_indexes, + std::vector lut_vec, std::vector lut_indexes_vec, + const LweArrayVariant &lwe_array_in, + const LweArrayVariant &lwe_input_indexes, void **bootstrapping_keys, + std::vector pbs_buffer, uint32_t glwe_dimension, + uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t grouping_factor, + uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx, + uint32_t max_shared_memory, PBS_TYPE pbs_type) { switch (sizeof(Torus)) { case sizeof(uint32_t): // 32 bits @@ -141,20 +136,32 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes, case MULTI_BIT: PANIC("Error: 32-bit multibit PBS is not supported.\n") case CLASSICAL: -#pragma omp parallel for num_threads(active_gpu_count) - for (uint i = 0; i < active_gpu_count; i++) { + for (uint i = 0; i < gpu_count; i++) { int num_inputs_on_gpu = get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count); + int gpu_offset = get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count); auto d_lut_vector_indexes = lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset); + + // Use the macro to get the correct elements for the current iteration + // Handles the case when the input/output are scattered through + // different gpus and when it is not + Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i); + Torus *current_lwe_output_indexes = + GET_VARIANT_ELEMENT(lwe_output_indexes, i); + Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i); + Torus *current_lwe_input_indexes = + GET_VARIANT_ELEMENT(lwe_input_indexes, i); + cuda_programmable_bootstrap_lwe_ciphertext_vector_32( - streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes, - lut_vec[i], d_lut_vector_indexes, lwe_array_in, lwe_input_indexes, + streams[i], gpu_indexes[i], current_lwe_array_out, + current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes, + current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts, - lwe_idx, max_shared_memory, gpu_offset); + lwe_idx, max_shared_memory); } break; default: @@ -168,38 +175,61 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes, case MULTI_BIT: if (grouping_factor == 0) PANIC("Multi-bit PBS error: grouping factor should be > 0.") -#pragma omp parallel for num_threads(active_gpu_count) - for (uint i = 0; i < active_gpu_count; i++) { + for (uint i = 0; i < gpu_count; i++) { int num_inputs_on_gpu = 
get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count); + + // Use the macro to get the correct elements for the current iteration + // Handles the case when the input/output are scattered through + // different gpus and when it is not + Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i); + Torus *current_lwe_output_indexes = + GET_VARIANT_ELEMENT(lwe_output_indexes, i); + Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i); + Torus *current_lwe_input_indexes = + GET_VARIANT_ELEMENT(lwe_input_indexes, i); + int gpu_offset = get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count); auto d_lut_vector_indexes = lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset); + cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( - streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes, - lut_vec[i], d_lut_vector_indexes, lwe_array_in, lwe_input_indexes, + streams[i], gpu_indexes[i], current_lwe_array_out, + current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes, + current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_inputs_on_gpu, num_luts, lwe_idx, max_shared_memory, - gpu_offset); + num_inputs_on_gpu, num_luts, lwe_idx, max_shared_memory); } break; case CLASSICAL: -#pragma omp parallel for num_threads(active_gpu_count) - for (uint i = 0; i < active_gpu_count; i++) { + for (uint i = 0; i < gpu_count; i++) { int num_inputs_on_gpu = get_num_inputs_on_gpu(input_lwe_ciphertext_count, i, gpu_count); + + // Use the macro to get the correct elements for the current iteration + // Handles the case when the input/output are scattered through + // different gpus and when it is not + Torus *current_lwe_array_out = GET_VARIANT_ELEMENT(lwe_array_out, i); + Torus *current_lwe_output_indexes = + GET_VARIANT_ELEMENT(lwe_output_indexes, i); + Torus *current_lwe_array_in = GET_VARIANT_ELEMENT(lwe_array_in, i); + Torus *current_lwe_input_indexes = + GET_VARIANT_ELEMENT(lwe_input_indexes, i); + int gpu_offset = get_gpu_offset(input_lwe_ciphertext_count, i, gpu_count); auto d_lut_vector_indexes = lut_indexes_vec[i] + (ptrdiff_t)(gpu_offset); + cuda_programmable_bootstrap_lwe_ciphertext_vector_64( - streams[i], gpu_indexes[i], lwe_array_out, lwe_output_indexes, - lut_vec[i], d_lut_vector_indexes, lwe_array_in, lwe_input_indexes, + streams[i], gpu_indexes[i], current_lwe_array_out, + current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes, + current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts, - lwe_idx, max_shared_memory, gpu_offset); + lwe_idx, max_shared_memory); } break; default: @@ -210,11 +240,6 @@ void execute_pbs(cudaStream_t *streams, uint32_t *gpu_indexes, PANIC("Cuda error: unsupported modulus size: only 32 and 64 bit integer " "moduli are supported.") } - - if (sync_streams) - for (uint i = 0; i < active_gpu_count; i++) { - cuda_synchronize_stream(streams[i], gpu_indexes[i]); - } } template diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu index de2f01d5b..89c1d2a9c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu @@ -158,7 +158,7 @@ void 
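The variant-typed parameters of execute_pbs_async above let one entry point accept either a single device pointer (the single-GPU path of the sum) or one pointer per GPU (the scattered path). A self-contained C++17 sketch of that calling convention follows; LweArrayVariantSketch, per_gpu_ptr and launch_on_each_gpu are hypothetical names, and a printf replaces the real per-GPU bootstrap launch.

#include <cstdint>
#include <cstdio>
#include <variant>
#include <vector>

// Either one pointer shared by every iteration, or one pointer per GPU.
template <typename Torus>
using LweArrayVariantSketch = std::variant<std::vector<Torus *>, Torus *>;

// Hypothetical helper: pick the pointer for GPU i, whichever form was passed.
template <typename Torus>
Torus *per_gpu_ptr(const LweArrayVariantSketch<Torus> &v, std::size_t i) {
  if (std::holds_alternative<std::vector<Torus *>>(v))
    return std::get<std::vector<Torus *>>(v)[i];
  return std::get<Torus *>(v);
}

template <typename Torus>
void launch_on_each_gpu(const LweArrayVariantSketch<Torus> &in,
                        const LweArrayVariantSketch<Torus> &out,
                        std::size_t gpu_count) {
  for (std::size_t i = 0; i < gpu_count; ++i) {
    Torus *gpu_in = per_gpu_ptr(in, i);
    Torus *gpu_out = per_gpu_ptr(out, i);
    std::printf("gpu %zu: in=%p out=%p\n", i, (void *)gpu_in, (void *)gpu_out);
  }
}

int main() {
  uint64_t a[4] = {}, b[4] = {}, c[4] = {};
  // Single-GPU style call: plain pointers convert into the variant.
  launch_on_each_gpu<uint64_t>(a, b, 1);
  // Multi-GPU style call: one pointer per device.
  std::vector<uint64_t *> ins{a, b}, outs{c, c};
  launch_on_each_gpu<uint64_t>(ins, outs, ins.size());
  return 0;
}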
cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t max_shared_memory) { if (base_log > 32) PANIC("Cuda error (amortized PBS): base log should be > number of bits in " @@ -172,7 +172,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 512: host_programmable_bootstrap_amortized>( @@ -181,7 +181,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 1024: host_programmable_bootstrap_amortized>( @@ -190,7 +190,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 2048: host_programmable_bootstrap_amortized>( @@ -199,7 +199,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 4096: host_programmable_bootstrap_amortized>( @@ -208,7 +208,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 8192: host_programmable_bootstrap_amortized>( @@ -217,7 +217,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 16384: host_programmable_bootstrap_amortized>( @@ -226,7 +226,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - 
num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported " @@ -307,7 +307,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t max_shared_memory) { if (base_log > 64) PANIC("Cuda error (amortized PBS): base log should be > number of bits in " @@ -321,7 +321,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 512: host_programmable_bootstrap_amortized>( @@ -330,7 +330,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 1024: host_programmable_bootstrap_amortized>( @@ -339,7 +339,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 2048: host_programmable_bootstrap_amortized>( @@ -348,7 +348,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 4096: host_programmable_bootstrap_amortized>( @@ -357,7 +357,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 8192: host_programmable_bootstrap_amortized>( @@ -366,7 +366,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; case 16384: host_programmable_bootstrap_amortized>( @@ -375,7 +375,7 @@ void 
cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_samples, num_luts, lwe_idx, max_shared_memory); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported " diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh index 0f1f98a2a..48536f137 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh @@ -55,7 +55,7 @@ __global__ void device_programmable_bootstrap_amortized( const double2 *__restrict__ bootstrapping_key, int8_t *device_mem, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t lwe_idx, - size_t device_memory_size_per_sample, uint32_t gpu_offset) { + size_t device_memory_size_per_sample) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is // much faster than global memory @@ -82,8 +82,7 @@ __global__ void device_programmable_bootstrap_amortized( (ptrdiff_t)((glwe_dimension + 1) * polynomial_size / 2); auto block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.x + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * (glwe_dimension + 1)]; @@ -201,7 +200,7 @@ __global__ void device_programmable_bootstrap_amortized( } auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] * + &lwe_array_out[lwe_output_indexes[blockIdx.x] * (glwe_dimension * polynomial_size + 1)]; // The blind rotation for this block is over @@ -302,7 +301,7 @@ __host__ void host_programmable_bootstrap_amortized( int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t max_shared_memory) { cudaSetDevice(gpu_index); uint64_t SM_FULL = @@ -336,14 +335,14 @@ __host__ void host_programmable_bootstrap_amortized( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, - level_count, lwe_idx, DM_FULL, gpu_offset); + level_count, lwe_idx, DM_FULL); } else if (max_shared_memory < SM_FULL) { device_programmable_bootstrap_amortized <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, - level_count, lwe_idx, DM_PART, gpu_offset); + level_count, lwe_idx, DM_PART); } else { // For devices with compute capability 7.x a single thread block can // address the full capacity of shared memory. 
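With gpu_offset removed, each device kernel above indexes its batch locally: block b simply reads lwe_input_indexes[b] and writes through lwe_output_indexes[b]. A toy CUDA kernel illustrating that indirect-indexing pattern on plain integers, with one block per sample as in the amortized PBS; the kernel name and parameters are hypothetical.

#include <cstdint>
#include <cuda_runtime.h>

// One block per sample: each block selects its input and output rows through
// index arrays, with no cross-GPU offset added to blockIdx.x.
__global__ void copy_selected_rows(const uint64_t *in,
                                   const uint64_t *in_indexes, uint64_t *out,
                                   const uint64_t *out_indexes, int row_len) {
  const uint64_t *src = &in[in_indexes[blockIdx.x] * row_len];
  uint64_t *dst = &out[out_indexes[blockIdx.x] * row_len];
  for (int i = threadIdx.x; i < row_len; i += blockDim.x)
    dst[i] = src[i];
}

// Launch sketch: num_samples blocks, each handling one indexed row.
// copy_selected_rows<<<num_samples, 256, 0, stream>>>(d_in, d_in_idx,
//                                                     d_out, d_out_idx,
//                                                     row_len);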
Shared memory on the @@ -355,7 +354,7 @@ __host__ void host_programmable_bootstrap_amortized( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, - level_count, lwe_idx, 0, gpu_offset); + level_count, lwe_idx, 0); } check_cuda_error(cudaGetLastError()); } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index 681ce1391..a7a14d996 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -44,7 +44,7 @@ __global__ void device_programmable_bootstrap_cg( const double2 *__restrict__ bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t gpu_offset) { + uint64_t device_memory_size_per_block) { grid_group grid = this_grid(); @@ -78,8 +78,7 @@ __global__ void device_programmable_bootstrap_cg( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps const Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; const Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * @@ -144,7 +143,7 @@ __global__ void device_programmable_bootstrap_cg( } auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] * + &lwe_array_out[lwe_output_indexes[blockIdx.z] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -206,7 +205,7 @@ __host__ void host_programmable_bootstrap_cg( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t num_luts, uint32_t max_shared_memory) { cudaSetDevice(gpu_index); // With SM each block corresponds to either the mask or body, no need to @@ -228,7 +227,7 @@ __host__ void host_programmable_bootstrap_cg( int thds = polynomial_size / params::opt; dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count); - void *kernel_args[15]; + void *kernel_args[14]; kernel_args[0] = &lwe_array_out; kernel_args[1] = &lwe_output_indexes; kernel_args[2] = &lut_vector; @@ -242,7 +241,6 @@ __host__ void host_programmable_bootstrap_cg( kernel_args[10] = &base_log; kernel_args[11] = &level_count; kernel_args[12] = &d_mem; - kernel_args[14] = &gpu_offset; if (max_shared_memory < partial_sm) { kernel_args[13] = &full_dm; diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index 18cce4e8e..59dfd2abf 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -29,7 +29,7 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate( uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset, 
uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t gpu_offset) { + uint64_t device_memory_size_per_block) { grid_group grid = this_grid(); @@ -58,8 +58,7 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps const Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; const Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * @@ -121,7 +120,7 @@ __global__ void device_multi_bit_programmable_bootstrap_cg_accumulate( if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) { auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] * + &lwe_array_out[lwe_output_indexes[blockIdx.z] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -262,8 +261,7 @@ __host__ void execute_cg_external_product_loop( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset, - uint32_t gpu_offset) { + uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset) { cudaSetDevice(gpu_index); uint64_t full_dm = @@ -286,7 +284,7 @@ __host__ void execute_cg_external_product_loop( auto global_accumulator = buffer->global_accumulator; auto buffer_fft = buffer->global_accumulator_fft; - void *kernel_args[21]; + void *kernel_args[20]; kernel_args[0] = &lwe_array_out; kernel_args[1] = &lwe_output_indexes; kernel_args[2] = &lut_vector; @@ -306,7 +304,6 @@ __host__ void execute_cg_external_product_loop( kernel_args[16] = &chunk_size; kernel_args[17] = &keybundle_size_per_input; kernel_args[18] = &d_mem; - kernel_args[20] = &gpu_offset; dim3 grid_accumulate(level_count, glwe_dimension + 1, num_samples); dim3 thds(polynomial_size / params::opt, 1, 1); @@ -341,7 +338,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size = 0) { + uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); if (!lwe_chunk_size) @@ -356,7 +353,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, max_shared_memory, - lwe_chunk_size, lwe_offset, gpu_offset); + lwe_chunk_size, lwe_offset); // Accumulate execute_cg_external_product_loop( @@ -364,7 +361,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, lwe_chunk_size, - max_shared_memory, lwe_offset, gpu_offset); + max_shared_memory, lwe_offset); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu index 8f2a59e80..2875491d3 
100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu @@ -132,7 +132,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t lwe_idx, uint32_t max_shared_memory) { switch (polynomial_size) { case 256: @@ -141,7 +141,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 512: host_programmable_bootstrap_tbc>( @@ -149,7 +149,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 1024: host_programmable_bootstrap_tbc>( @@ -157,7 +157,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 2048: host_programmable_bootstrap_tbc>( @@ -165,7 +165,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 4096: host_programmable_bootstrap_tbc>( @@ -173,7 +173,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 8192: host_programmable_bootstrap_tbc>( @@ -181,7 +181,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 16384: host_programmable_bootstrap_tbc>( @@ -189,7 +189,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -411,7 +411,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t lwe_idx, uint32_t max_shared_memory) { switch (polynomial_size) { case 256: @@ -420,7 +420,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 512: host_programmable_bootstrap_cg>( @@ -428,7 +428,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 1024: host_programmable_bootstrap_cg>( @@ -436,7 +436,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 2048: host_programmable_bootstrap_cg>( @@ -444,7 +444,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 4096: host_programmable_bootstrap_cg>( @@ -452,7 +452,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 8192: host_programmable_bootstrap_cg>( @@ -460,7 +460,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 16384: host_programmable_bootstrap_cg>( @@ -468,7 +468,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -485,7 +485,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t lwe_idx, uint32_t max_shared_memory) { switch (polynomial_size) { case 256: @@ -494,7 +494,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 512: host_programmable_bootstrap>( @@ -502,7 +502,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 1024: host_programmable_bootstrap>( @@ -510,7 +510,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 2048: host_programmable_bootstrap>( @@ -518,7 +518,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 4096: host_programmable_bootstrap>( @@ -526,7 +526,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 8192: host_programmable_bootstrap>( @@ -534,7 +534,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; case 16384: host_programmable_bootstrap>( @@ -542,7 +542,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory, gpu_offset); + num_luts, max_shared_memory); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -560,7 +560,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t max_shared_memory) { if (base_log > 32) PANIC("Cuda error (classical PBS): base log should be > number of bits " @@ -582,7 +582,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_luts, lwe_idx, max_shared_memory); #else PANIC("Cuda error (PBS): TBC pbs is not supported.") #endif @@ -598,7 +598,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_luts, lwe_idx, max_shared_memory); break; case DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -611,7 +611,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_luts, lwe_idx, max_shared_memory); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -697,7 +697,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t max_shared_memory) { if (base_log > 64) PANIC("Cuda error (classical PBS): base log should be > number of bits " "in the ciphertext representation (64)"); @@ -718,7 +718,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_luts, lwe_idx, max_shared_memory); #else PANIC("Cuda error (PBS): TBC pbs is not supported.") #endif @@ -734,7 +734,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_luts, lwe_idx, max_shared_memory); break; case PBS_VARIANT::DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -747,7 +747,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory, gpu_offset); + num_luts, lwe_idx, max_shared_memory); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -777,7 +777,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t 
gpu_offset); + uint32_t lwe_idx, uint32_t max_shared_memory); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, @@ -787,7 +787,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); + uint32_t lwe_idx, uint32_t max_shared_memory); template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, @@ -810,7 +810,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); + uint32_t lwe_idx, uint32_t max_shared_memory); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint32_t *lwe_array_out, @@ -820,7 +820,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); + uint32_t lwe_idx, uint32_t max_shared_memory); template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, @@ -851,7 +851,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); + uint32_t lwe_idx, uint32_t max_shared_memory); template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t *lwe_output_indexes, uint64_t *lut_vector, @@ -860,7 +860,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t gpu_offset); + uint32_t lwe_idx, uint32_t max_shared_memory); template void scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index 0f4dec5fc..241370906 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -26,7 +26,7 @@ __global__ void device_programmable_bootstrap_step_one( double2 *global_accumulator_fft, uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t gpu_offset) { + uint64_t device_memory_size_per_block) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is 
kept in L1 cache and accessing it is @@ -54,8 +54,7 @@ __global__ void device_programmable_bootstrap_step_one( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps const Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; const Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * @@ -140,7 +139,7 @@ __global__ void device_programmable_bootstrap_step_two( double2 *global_accumulator_fft, uint32_t lwe_iteration, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t gpu_offset) { + uint64_t device_memory_size_per_block) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is @@ -206,7 +205,7 @@ __global__ void device_programmable_bootstrap_step_two( if (lwe_iteration + 1 == lwe_dimension) { // Last iteration auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] * + &lwe_array_out[lwe_output_indexes[blockIdx.x] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -331,17 +330,15 @@ __host__ void scratch_programmable_bootstrap( } template -__host__ void -execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, - Torus *lwe_input_indexes, double2 *bootstrapping_key, - Torus *global_accumulator, double2 *global_accumulator_fft, - uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, - uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, int8_t *d_mem, - uint32_t max_shared_memory, int lwe_iteration, - uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm, - uint64_t full_dm, uint32_t gpu_offset) { +__host__ void execute_step_one( + cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, + Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, + double2 *bootstrapping_key, Torus *global_accumulator, + double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count, + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, int8_t *d_mem, + uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm, + uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) { cudaSetDevice(gpu_index); int thds = polynomial_size / params::opt; @@ -353,37 +350,35 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, full_dm, gpu_offset); + level_count, d_mem, full_dm); } else if (max_shared_memory < full_sm) { device_programmable_bootstrap_step_one <<>>( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, partial_dm, gpu_offset); + level_count, d_mem, partial_dm); } else { device_programmable_bootstrap_step_one <<>>( lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, 
global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, 0, gpu_offset); + level_count, d_mem, 0); } check_cuda_error(cudaGetLastError()); } template -__host__ void -execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, - Torus *lut_vector_indexes, double2 *bootstrapping_key, - Torus *global_accumulator, double2 *global_accumulator_fft, - uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, - uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, int8_t *d_mem, - uint32_t max_shared_memory, int lwe_iteration, - uint64_t partial_sm, uint64_t partial_dm, uint64_t full_sm, - uint64_t full_dm, uint32_t gpu_offset) { +__host__ void execute_step_two( + cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, + Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, + double2 *bootstrapping_key, Torus *global_accumulator, + double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count, + uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, int8_t *d_mem, + uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm, + uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) { cudaSetDevice(gpu_index); int thds = polynomial_size / params::opt; @@ -395,21 +390,21 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, full_dm, gpu_offset); + level_count, d_mem, full_dm); } else if (max_shared_memory < full_sm) { device_programmable_bootstrap_step_two <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, partial_dm, gpu_offset); + level_count, d_mem, partial_dm); } else { device_programmable_bootstrap_step_two <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, lwe_iteration, lwe_dimension, polynomial_size, base_log, - level_count, d_mem, 0, gpu_offset); + level_count, d_mem, 0); } check_cuda_error(cudaGetLastError()); } @@ -424,7 +419,7 @@ __host__ void host_programmable_bootstrap( pbs_buffer *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t num_luts, uint32_t max_shared_memory) { cudaSetDevice(gpu_index); // With SM each block corresponds to either the mask or body, no need to @@ -455,14 +450,14 @@ __host__ void host_programmable_bootstrap( global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, d_mem, max_shared_memory, i, partial_sm, partial_dm_step_one, full_sm_step_one, - full_dm_step_one, gpu_offset); + full_dm_step_one); execute_step_two( stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, d_mem, 
max_shared_memory, i, partial_sm, partial_dm_step_two, full_sm_step_two, - full_dm_step_two, gpu_offset); + full_dm_step_two); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index f04a3c8be..e994e203a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -75,7 +75,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size) { + uint32_t lwe_chunk_size) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -89,8 +89,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, - lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); break; case 512: host_cg_multi_bit_programmable_bootstrap 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -182,8 +175,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, - lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); break; case 512: host_multi_bit_programmable_bootstrap>( @@ -191,8 +183,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, - lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); break; case 1024: host_multi_bit_programmable_bootstrap *buffer = (pbs_buffer *)mem_ptr; @@ -277,8 +262,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(bootstrapping_key), (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, - lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); #else PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.") #endif @@ -293,8 +277,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, - lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); break; case PBS_VARIANT::DEFAULT: cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -306,8 +289,7 @@ 
void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, - lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.") @@ -575,7 +557,7 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); template void scratch_cuda_cg_multi_bit_programmable_bootstrap( @@ -595,7 +577,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( @@ -683,7 +665,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size) { + uint32_t lwe_chunk_size) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -697,8 +679,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, gpu_offset, - lwe_chunk_size); + num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); break; case 512: host_tbc_multi_bit_programmable_bootstrap( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index b2069d2f6..4e5b5fca2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -40,8 +40,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t lwe_offset, uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, - int8_t *device_mem, uint64_t device_memory_size_per_block, - uint32_t gpu_offset) { + int8_t *device_mem, uint64_t device_memory_size_per_block) { extern __shared__ int8_t sharedmem[]; int8_t *selected_memory = sharedmem; @@ -66,8 +65,7 @@ __global__ void 
device_multi_bit_programmable_bootstrap_keybundle( Torus *accumulator = (Torus *)selected_memory; const Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[input_idx + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[input_idx] * (lwe_dimension + 1)]; double2 *keybundle = keybundle_array + // select the input @@ -157,7 +155,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one( double2 *global_accumulator_fft, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t lwe_iteration, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t gpu_offset) { + uint64_t device_memory_size_per_block) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is @@ -184,8 +182,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one( accumulator_fft = (double2 *)sharedmem; const Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; const Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * @@ -254,7 +251,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset, uint32_t lwe_chunk_size, int8_t *device_mem, - uint64_t device_memory_size_per_block, uint32_t gpu_offset) { + uint64_t device_memory_size_per_block) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is // much faster than global memory @@ -318,7 +315,7 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two( if (lwe_iteration + 1 == (lwe_dimension / grouping_factor)) { // Last iteration auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.x + gpu_offset] * + &lwe_array_out[lwe_output_indexes[blockIdx.x] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -492,8 +489,7 @@ __host__ void execute_compute_keybundle( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t max_shared_memory, uint32_t lwe_chunk_size, int lwe_offset, - uint32_t gpu_offset) { + uint32_t max_shared_memory, uint32_t lwe_chunk_size, int lwe_offset) { cudaSetDevice(gpu_index); uint32_t chunk_size = @@ -521,27 +517,26 @@ __host__ void execute_compute_keybundle( lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, full_sm_keybundle, gpu_offset); + keybundle_size_per_input, d_mem, full_sm_keybundle); else device_multi_bit_programmable_bootstrap_keybundle <<>>( lwe_array_in, lwe_input_indexes, keybundle_fft, bootstrapping_key, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, 0, gpu_offset); + keybundle_size_per_input, d_mem, 0); check_cuda_error(cudaGetLastError()); } template -__host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index, - 
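In the keybundle and accumulate kernels above, lwe_input_indexes[blockIdx.z + gpu_offset] becomes lwe_input_indexes[blockIdx.z]. The toy kernel below, written only to illustrate that indexing contract (none of these names exist in the backend), shows what the change assumes: the index array handed to each GPU is already local to that GPU's slice, so blockIdx.z addresses it directly.

// Toy CUDA program: zero-based per-GPU indexing after the host has sliced
// the batch. Purely an illustration, not backend code.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void copy_first_coefficient(const uint64_t *lwe_array_in,
                                       const uint64_t *lwe_input_indexes,
                                       uint64_t *first_coeffs,
                                       uint32_t lwe_size) {
  // blockIdx.z selects a ciphertext inside this GPU's local batch; no
  // gpu_offset is added because the slicing already happened on the host.
  const uint64_t *ct = &lwe_array_in[lwe_input_indexes[blockIdx.z] * lwe_size];
  if (threadIdx.x == 0)
    first_coeffs[blockIdx.z] = ct[0];
}

int main() {
  const uint32_t num_inputs = 4, lwe_size = 3;
  uint64_t h_in[num_inputs * lwe_size], h_idx[num_inputs], h_out[num_inputs];
  for (uint32_t i = 0; i < num_inputs; i++) {
    h_idx[i] = i; // trivial (identity) indexing
    for (uint32_t j = 0; j < lwe_size; j++)
      h_in[i * lwe_size + j] = 100 * i + j;
  }
  uint64_t *d_in, *d_idx, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_idx, sizeof(h_idx));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  cudaMemcpy(d_idx, h_idx, sizeof(h_idx), cudaMemcpyHostToDevice);
  dim3 grid(1, 1, num_inputs); // one block along z per local ciphertext
  copy_first_coefficient<<<grid, 32>>>(d_in, d_idx, d_out, lwe_size);
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  for (uint32_t i = 0; i < num_inputs; i++)
    std::printf("input %u first coeff = %llu\n", i, (unsigned long long)h_out[i]);
  cudaFree(d_in); cudaFree(d_idx); cudaFree(d_out);
}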
Torus *lut_vector, Torus *lut_vector_indexes, - Torus *lwe_array_in, Torus *lwe_input_indexes, - pbs_buffer *buffer, - uint32_t num_samples, uint32_t lwe_dimension, - uint32_t glwe_dimension, - uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t max_shared_memory, - int j, int lwe_offset, uint32_t gpu_offset) { +__host__ void +execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, + Torus *lut_vector_indexes, Torus *lwe_array_in, + Torus *lwe_input_indexes, pbs_buffer *buffer, + uint32_t num_samples, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, + uint32_t max_shared_memory, int j, int lwe_offset) { cudaSetDevice(gpu_index); uint64_t full_sm_accumulate_step_one = @@ -566,7 +561,7 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index, lwe_array_in, lwe_input_indexes, lut_vector, lut_vector_indexes, global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, - j + lwe_offset, d_mem, full_sm_accumulate_step_one, gpu_offset); + j + lwe_offset, d_mem, full_sm_accumulate_step_one); else if (max_shared_memory < full_sm_accumulate_step_one) device_multi_bit_programmable_bootstrap_accumulate_step_one @@ -575,7 +570,7 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index, lut_vector_indexes, global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, j + lwe_offset, - d_mem, partial_sm_accumulate_step_one, gpu_offset); + d_mem, partial_sm_accumulate_step_one); else device_multi_bit_programmable_bootstrap_accumulate_step_one @@ -584,7 +579,7 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index, lut_vector_indexes, global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, j + lwe_offset, - d_mem, 0, gpu_offset); + d_mem, 0); check_cuda_error(cudaGetLastError()); } @@ -596,7 +591,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count, uint32_t max_shared_memory, int j, - int lwe_offset, uint32_t lwe_chunk_size, uint32_t gpu_offset) { + int lwe_offset, uint32_t lwe_chunk_size) { cudaSetDevice(gpu_index); uint64_t full_sm_accumulate_step_two = @@ -618,8 +613,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, lwe_array_out, lwe_output_indexes, keybundle_fft, global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, level_count, grouping_factor, j, - lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two, - gpu_offset); + lwe_offset, lwe_chunk_size, d_mem, full_sm_accumulate_step_two); else device_multi_bit_programmable_bootstrap_accumulate_step_two @@ -627,8 +621,7 @@ execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, stream>>>(lwe_array_out, lwe_output_indexes, keybundle_fft, global_accumulator, global_accumulator_fft, lwe_dimension, glwe_dimension, polynomial_size, level_count, - grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0, - gpu_offset); + grouping_factor, j, lwe_offset, lwe_chunk_size, d_mem, 0); check_cuda_error(cudaGetLastError()); } @@ -641,7 +634,7 @@ __host__ void host_multi_bit_programmable_bootstrap( uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t 
base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size = 0) { + uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); // If a chunk size is not passed to this function, select one. @@ -657,7 +650,7 @@ __host__ void host_multi_bit_programmable_bootstrap( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, max_shared_memory, - lwe_chunk_size, lwe_offset, gpu_offset); + lwe_chunk_size, lwe_offset); // Accumulate uint32_t chunk_size = std::min( lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); @@ -666,13 +659,13 @@ __host__ void host_multi_bit_programmable_bootstrap( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, max_shared_memory, j, - lwe_offset, gpu_offset); + lwe_offset); execute_step_two( stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, level_count, max_shared_memory, j, lwe_offset, - lwe_chunk_size, gpu_offset); + lwe_chunk_size); } } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index ff379bb91..ff5651806 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -44,8 +44,7 @@ __global__ void device_programmable_bootstrap_tbc( const double2 *__restrict__ bootstrapping_key, double2 *join_buffer, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, int8_t *device_mem, - uint64_t device_memory_size_per_block, bool support_dsm, - uint32_t gpu_offset) { + uint64_t device_memory_size_per_block, bool support_dsm) { cluster_group cluster = this_cluster(); @@ -82,8 +81,7 @@ __global__ void device_programmable_bootstrap_tbc( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps const Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; const Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * @@ -148,7 +146,7 @@ __global__ void device_programmable_bootstrap_tbc( } auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] * + &lwe_array_out[lwe_output_indexes[blockIdx.z] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -229,7 +227,7 @@ __host__ void host_programmable_bootstrap_tbc( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t max_shared_memory, uint32_t gpu_offset) { + uint32_t num_luts, uint32_t max_shared_memory) { cudaSetDevice(gpu_index); auto supports_dsm = @@ -283,7 +281,7 @@ __host__ void host_programmable_bootstrap_tbc( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, 
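host_multi_bit_programmable_bootstrap keeps its chunked structure, just without the offset argument: for each slice of lwe_chunk_size mask elements it builds the keybundle, then runs accumulate step one and step two for every position in the chunk. A bare skeleton of that control flow, with the three steps stubbed out purely for illustration, looks like this:

// Skeleton only: mirrors the chunked loop above, with stub functions in
// place of execute_compute_keybundle / execute_step_one / execute_step_two.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static void compute_keybundle(uint32_t lwe_offset) {
  std::printf("keybundle for mask elements starting at %u\n", lwe_offset);
}
static void step_one(uint32_t it) { std::printf("  step one, iteration %u\n", it); }
static void step_two(uint32_t it) { std::printf("  step two, iteration %u\n", it); }

int main() {
  const uint32_t lwe_dimension = 16, grouping_factor = 2, lwe_chunk_size = 3;
  // One keybundle per chunk, then one accumulate pass per chunk element.
  for (uint32_t lwe_offset = 0; lwe_offset < lwe_dimension / grouping_factor;
       lwe_offset += lwe_chunk_size) {
    compute_keybundle(lwe_offset);
    uint32_t chunk = std::min(lwe_chunk_size,
                              lwe_dimension / grouping_factor - lwe_offset);
    for (uint32_t j = 0; j < chunk; j++) {
      step_one(j + lwe_offset);
      step_two(j + lwe_offset);
    }
  }
}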
base_log, level_count, d_mem, full_dm, - supports_dsm, gpu_offset)); + supports_dsm)); } else if (max_shared_memory < full_sm + minimum_sm_tbc) { config.dynamicSmemBytes = partial_sm + minimum_sm_tbc; @@ -292,7 +290,7 @@ __host__ void host_programmable_bootstrap_tbc( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, - partial_dm, supports_dsm, gpu_offset)); + partial_dm, supports_dsm)); } else { config.dynamicSmemBytes = full_sm + minimum_sm_tbc; @@ -301,7 +299,7 @@ __host__ void host_programmable_bootstrap_tbc( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer_fft, lwe_dimension, polynomial_size, base_log, level_count, d_mem, 0, - supports_dsm, gpu_offset)); + supports_dsm)); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index 23f74a2d6..0aa711543 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -29,8 +29,7 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate( uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input, int8_t *device_mem, - uint64_t device_memory_size_per_block, bool support_dsm, - uint32_t gpu_offset) { + uint64_t device_memory_size_per_block, bool support_dsm) { cluster_group cluster = this_cluster(); @@ -66,8 +65,7 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate( // The third dimension of the block is used to determine on which ciphertext // this block is operating, in the case of batch bootstraps const Torus *block_lwe_array_in = - &lwe_array_in[lwe_input_indexes[blockIdx.z + gpu_offset] * - (lwe_dimension + 1)]; + &lwe_array_in[lwe_input_indexes[blockIdx.z] * (lwe_dimension + 1)]; const Torus *block_lut_vector = &lut_vector[lut_vector_indexes[blockIdx.z] * params::degree * @@ -129,7 +127,7 @@ __global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate( if (lwe_offset + lwe_chunk_size >= (lwe_dimension / grouping_factor)) { auto block_lwe_array_out = - &lwe_array_out[lwe_output_indexes[blockIdx.z + gpu_offset] * + &lwe_array_out[lwe_output_indexes[blockIdx.z] * (glwe_dimension * polynomial_size + 1) + blockIdx.y * polynomial_size]; @@ -273,8 +271,7 @@ __host__ void execute_tbc_external_product_loop( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset, - uint32_t gpu_offset) { + uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset) { cudaSetDevice(gpu_index); auto supports_dsm = @@ -334,7 +331,7 @@ __host__ void execute_tbc_external_product_loop( lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, grouping_factor, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, full_dm, supports_dsm, gpu_offset)); + keybundle_size_per_input, d_mem, full_dm, supports_dsm)); } else if (max_shared_memory < full_dm + 
minimum_dm) { config.dynamicSmemBytes = partial_dm + minimum_dm; check_cuda_error(cudaLaunchKernelEx( @@ -345,7 +342,7 @@ __host__ void execute_tbc_external_product_loop( lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, grouping_factor, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, partial_dm, supports_dsm, gpu_offset)); + keybundle_size_per_input, d_mem, partial_dm, supports_dsm)); } else { config.dynamicSmemBytes = full_dm + minimum_dm; check_cuda_error(cudaLaunchKernelEx( @@ -356,7 +353,7 @@ __host__ void execute_tbc_external_product_loop( lwe_array_in, lwe_input_indexes, keybundle_fft, buffer_fft, global_accumulator, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, grouping_factor, lwe_offset, chunk_size, - keybundle_size_per_input, d_mem, 0, supports_dsm, gpu_offset)); + keybundle_size_per_input, d_mem, 0, supports_dsm)); } } @@ -369,7 +366,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t gpu_offset, uint32_t lwe_chunk_size = 0) { + uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); if (!lwe_chunk_size) @@ -384,7 +381,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, max_shared_memory, - lwe_chunk_size, lwe_offset, gpu_offset); + lwe_chunk_size, lwe_offset); // Accumulate execute_tbc_external_product_loop( @@ -392,7 +389,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, lwe_chunk_size, - max_shared_memory, lwe_offset, gpu_offset); + max_shared_memory, lwe_offset); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu index 8e8189e70..a6d6cdd54 100644 --- a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu +++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cu @@ -1,5 +1,5 @@ #include "device.h" -#include "helper_multi_gpu.h" +#include "helper_multi_gpu.cuh" #include #include @@ -21,34 +21,14 @@ int cuda_setup_multi_gpu() { check_cuda_error( cudaDeviceCanAccessPeer(&has_peer_access_to_device_0, i, 0)); if (has_peer_access_to_device_0) { - cudaMemPool_t mempool; - cudaMemAccessDesc desc = {}; - // Enable P2P Access and mempool access check_cuda_error(cudaSetDevice(i)); check_cuda_error(cudaDeviceEnablePeerAccess(0, 0)); - - check_cuda_error(cudaDeviceGetDefaultMemPool(&mempool, 0)); - desc.location.type = cudaMemLocationTypeDevice; - desc.location.id = i; - desc.flags = cudaMemAccessFlagsProtReadWrite; - check_cuda_error( - cudaMemPoolSetAccess(mempool, &desc, 1 /* numDescs */)); - num_used_gpus += 1; - } else { - break; } + num_used_gpus += 1; } } else { - int has_peer_access_to_device_0; - for (int i = 1; i < num_gpus; i++) { - check_cuda_error( - cudaDeviceCanAccessPeer(&has_peer_access_to_device_0, i, 0)); - if (has_peer_access_to_device_0) { - num_used_gpus += 1; - } else { - break; - } - } + for (int i = 1; i < num_gpus; i++) + num_used_gpus += 
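The simplified cuda_setup_multi_gpu above now only probes peer access towards device 0, enables it where available, and counts the devices; the explicit memory-pool access descriptors are gone. A standalone probe using the same runtime calls (cudaDeviceCanAccessPeer, cudaDeviceEnablePeerAccess) can be written as below; this is a diagnostic sketch, not the backend function itself.

// Diagnostic sketch: report and enable P2P access from every GPU to GPU 0.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int num_gpus = 0;
  cudaGetDeviceCount(&num_gpus);
  std::printf("%d GPU(s) visible\n", num_gpus);
  for (int i = 1; i < num_gpus; i++) {
    int can_access = 0;
    cudaDeviceCanAccessPeer(&can_access, i, 0);
    if (can_access) {
      cudaSetDevice(i);
      // The second argument is a reserved flags field and must be 0.
      cudaError_t err = cudaDeviceEnablePeerAccess(0, 0);
      std::printf("GPU %d -> GPU 0 peer access: %s\n", i,
                  err == cudaSuccess ? "enabled" : cudaGetErrorString(err));
    } else {
      std::printf("GPU %d cannot access GPU 0 directly\n", i);
    }
  }
  return 0;
}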
1; } m.unlock(); } diff --git a/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh new file mode 100644 index 000000000..08cb9dfe9 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/utils/helper_multi_gpu.cuh @@ -0,0 +1,133 @@ +#ifndef HELPER_MULTI_GPU_CUH +#define HELPER_MULTI_GPU_CUH + +#include "helper_multi_gpu.h" + +/// Initialize same-size arrays on all active gpus +template +void multi_gpu_alloc_array_async(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, std::vector &dest, + uint32_t elements_per_gpu) { + + dest.resize(gpu_count); + for (uint i = 0; i < gpu_count; i++) { + Torus *d_array = (Torus *)cuda_malloc_async( + elements_per_gpu * sizeof(Torus), streams[i], gpu_indexes[i]); + dest[i] = d_array; + } +} +/// Copy an array residing on one GPU to all active gpus +template +void multi_gpu_copy_array_async(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, std::vector &dest, + Torus *src, uint32_t elements_per_gpu) { + dest.resize(gpu_count); + for (uint i = 0; i < gpu_count; i++) { + cuda_memcpy_async_gpu_to_gpu(dest[i], src, elements_per_gpu * sizeof(Torus), + streams[i], gpu_indexes[i]); + } +} +/// Allocates the input/output vector for all devices +/// Initializes also the related indexing and initializes it to the trivial +/// index +template +void multi_gpu_alloc_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, std::vector &dest, + uint32_t num_inputs, uint32_t lwe_size) { + dest.resize(gpu_count); + for (uint i = 0; i < gpu_count; i++) { + auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count); + Torus *d_array = (Torus *)cuda_malloc_async( + inputs_on_gpu * lwe_size * sizeof(Torus), streams[i], gpu_indexes[i]); + dest[i] = d_array; + } +} + +/// Load an array residing on one GPU to all active gpus +/// and split the array among them. +/// The input indexing logic is given by an index array. 
+/// The output indexing is always the trivial one +template +void multi_gpu_scatter_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, std::vector &dest, + Torus *src, Torus *h_src_indexes, + bool is_trivial_index, uint32_t num_inputs, + uint32_t lwe_size) { + + cuda_synchronize_stream(streams[0], gpu_indexes[0]); + dest.resize(gpu_count); + for (uint i = 0; i < gpu_count; i++) { + auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count); + auto gpu_offset = 0; + for (uint j = 0; j < i; j++) { + gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count); + } + + if (is_trivial_index) { + auto d_dest = dest[i]; + auto d_src = src + gpu_offset * lwe_size; + cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, + inputs_on_gpu * lwe_size * sizeof(Torus), + streams[i], gpu_indexes[i]); + + } else { + auto src_indexes = h_src_indexes + gpu_offset; + + for (uint j = 0; j < inputs_on_gpu; j++) { + auto d_dest = dest[i] + j * lwe_size; + auto d_src = src + src_indexes[j] * lwe_size; + + cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, lwe_size * sizeof(Torus), + streams[i], gpu_indexes[i]); + } + } + } +} + +/// Copy data from multiple GPUs back to GPU 0 following the indexing given in +/// dest_indexes +/// The input indexing should be the trivial one +template +void multi_gpu_gather_lwe_async(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, Torus *dest, + const std::vector &src, + Torus *h_dest_indexes, bool is_trivial_index, + uint32_t num_inputs, uint32_t lwe_size) { + + for (uint i = 0; i < gpu_count; i++) { + auto inputs_on_gpu = get_num_inputs_on_gpu(num_inputs, i, gpu_count); + auto gpu_offset = 0; + for (uint j = 0; j < i; j++) { + gpu_offset += get_num_inputs_on_gpu(num_inputs, j, gpu_count); + } + + if (is_trivial_index) { + auto d_dest = dest + gpu_offset * lwe_size; + auto d_src = src[i]; + + cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, + inputs_on_gpu * lwe_size * sizeof(Torus), + streams[i], gpu_indexes[i]); + } else { + auto dest_indexes = h_dest_indexes + gpu_offset; + + for (uint j = 0; j < inputs_on_gpu; j++) { + auto d_dest = dest + dest_indexes[j] * lwe_size; + auto d_src = src[i] + j * lwe_size; + + cuda_memcpy_async_gpu_to_gpu(d_dest, d_src, lwe_size * sizeof(Torus), + streams[i], gpu_indexes[i]); + } + } + } +} + +template +void multi_gpu_release_async(cudaStream_t *streams, uint32_t *gpu_indexes, + std::vector &vec) { + + for (uint i = 0; i < vec.size(); i++) + cuda_drop_async(vec[i], streams[i], gpu_indexes[i]); +} + +#endif diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index e36e3c94d..322be12f2 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -340,7 +340,6 @@ extern "C" { base_log: u32, level_count: u32, num_samples: u32, - gpu_offset: u32, ); /// Perform the negation of a u64 input LWE ciphertext vector. 
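Taken together, the new helpers support a scatter/compute/gather pattern for an LWE batch that starts out on GPU 0. The fragment below is a usage sketch only: round_trip_example is a made-up wrapper, it assumes the backend headers (helper_multi_gpu.cuh and the cuda_* stream utilities it relies on) are on the include path, and it elides the actual per-GPU computation.

// Usage sketch (not from the patch): distribute a batch living on GPU 0,
// let each GPU work on its local slice, then collect the results.
#include <cstdint>
#include <vector>
#include <cuda_runtime.h>
#include "helper_multi_gpu.cuh"

void round_trip_example(cudaStream_t *streams, uint32_t *gpu_indexes,
                        uint32_t gpu_count, uint64_t *d_lwe_on_gpu_0,
                        uint32_t num_inputs, uint32_t lwe_size) {
  std::vector<uint64_t *> lwe_slices;

  // One buffer per GPU, sized via get_num_inputs_on_gpu inside the helper.
  multi_gpu_alloc_lwe_async(streams, gpu_indexes, gpu_count, lwe_slices,
                            num_inputs, lwe_size);

  // Trivial (identity) input indexing: contiguous chunks go to each GPU,
  // so no host-side index array is needed.
  multi_gpu_scatter_lwe_async(streams, gpu_indexes, gpu_count, lwe_slices,
                              d_lwe_on_gpu_0, (uint64_t *)nullptr,
                              /*is_trivial_index=*/true, num_inputs, lwe_size);

  // ... launch per-GPU kernels on lwe_slices[i] with zero-based indexes ...

  // Copy every slice back into the GPU 0 buffer, again with trivial indexing.
  multi_gpu_gather_lwe_async(streams, gpu_indexes, gpu_count, d_lwe_on_gpu_0,
                             lwe_slices, (uint64_t *)nullptr,
                             /*is_trivial_index=*/true, num_inputs, lwe_size);

  multi_gpu_release_async(streams, gpu_indexes, lwe_slices);
}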
diff --git a/ci/ec2_products_cost.json b/ci/ec2_products_cost.json index 46b9c8328..3e9de534c 100644 --- a/ci/ec2_products_cost.json +++ b/ci/ec2_products_cost.json @@ -6,5 +6,6 @@ "p4d.24xlarge": 32.7726, "p5.48xlarge": 98.32, "rtx4090": 0.04, - "n3-H100x1": 4.30 + "n3-H100x1": 4.30, + "n3-H100x8-NVLink": 22.6 } diff --git a/ci/slab.toml b/ci/slab.toml index bbf2f11dc..4d5203d43 100644 --- a/ci/slab.toml +++ b/ci/slab.toml @@ -46,7 +46,7 @@ flavor_name = "n3-H100x1" [backend.hyperstack.multi-h100-nvlink] environment_name = "canada" image_name = "Ubuntu Server 22.04 LTS R535 CUDA 12.2" -flavor_name = "n3-H100x8-NVLink-K8s" +flavor_name = "n3-H100x8-NVLink" [backend.hyperstack.multi-a100-nvlink] environment_name = "canada" diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 8eb516f2a..5a54d2a56 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -257,7 +257,6 @@ pub unsafe fn keyswitch_async( base_log.0 as u32, l_gadget.0 as u32, num_samples, - 0, ); }