Mirror of https://github.com/zama-ai/tfhe-rs.git (synced 2026-01-06 21:34:05 -05:00)
refactor(gpu): Threshold for multi-GPU with Classical PBS

commit ca2a79f1fb, parent 0a59e86675, committed by Agnès Leroy
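This change replaces the single THRESHOLD_MULTI_GPU constant (12) with two
constants, THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS (12) and
THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS (68), and threads the PBS type
through get_active_gpu_count, CudaStreams::active_gpu_subset and the
multi-GPU LWE allocation helpers. The number of inputs a single GPU handles
before work is spread across devices now depends on whether classical or
multi-bit bootstrapping parameters are in use: with classical parameters a
workload must be roughly five to six times larger before a second GPU is
enlisted.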
@@ -35,7 +35,8 @@ template <typename Torus> struct int_aes_lut_buffers {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, and_lambda, allocate_gpu_memory);
     auto active_streams_and_lut = streams.active_gpu_subset(
-        SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism);
+        SBOX_MAX_AND_GATES * num_aes_inputs * sbox_parallelism,
+        params.pbs_type);
     this->and_lut->broadcast_lut(active_streams_and_lut);
     this->and_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

@@ -50,8 +51,8 @@ template <typename Torus> struct int_aes_lut_buffers {
         this->flush_lut->get_degree(0), this->flush_lut->get_max_degree(0),
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, flush_lambda, allocate_gpu_memory);
-    auto active_streams_flush_lut =
-        streams.active_gpu_subset(AES_STATE_BITS * num_aes_inputs);
+    auto active_streams_flush_lut = streams.active_gpu_subset(
+        AES_STATE_BITS * num_aes_inputs, params.pbs_type);
     this->flush_lut->broadcast_lut(active_streams_flush_lut);
     this->flush_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

@@ -65,7 +66,8 @@ template <typename Torus> struct int_aes_lut_buffers {
         this->carry_lut->get_degree(0), this->carry_lut->get_max_degree(0),
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, carry_lambda, allocate_gpu_memory);
-    auto active_streams_carry_lut = streams.active_gpu_subset(num_aes_inputs);
+    auto active_streams_carry_lut =
+        streams.active_gpu_subset(num_aes_inputs, params.pbs_type);
     this->carry_lut->broadcast_lut(active_streams_carry_lut);
     this->carry_lut->setup_gemm_batch_ks_temp_buffers(size_tracker);

@@ -8,7 +8,8 @@
 
 extern std::mutex m;
 extern bool p2p_enabled;
-extern const int THRESHOLD_MULTI_GPU;
+extern const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS;
+extern const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
 
 extern "C" {
 int32_t cuda_setup_multi_gpu(int device_0_id);

@@ -39,7 +40,8 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
   }
 }
 
-uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count);
+uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
+                              PBS_TYPE pbs_type);
 
 int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

@@ -73,9 +75,10 @@ public:
 
   // Returns a subset of this set as an active subset. An active subset is one
   // that is temporarily used to perform some computation
-  CudaStreams active_gpu_subset(int num_radix_blocks) {
-    return CudaStreams(_streams, _gpu_indexes,
-                       get_active_gpu_count(num_radix_blocks, _gpu_count));
+  CudaStreams active_gpu_subset(int num_radix_blocks, PBS_TYPE pbs_type) {
+    return CudaStreams(
+        _streams, _gpu_indexes,
+        get_active_gpu_count(num_radix_blocks, _gpu_count, pbs_type));
   }
 
   // Returns a CudaStreams struct containing only the ith stream
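With the extra parameter, the same stream set can resolve to subsets of
different sizes depending on the PBS flavor. A hedged usage sketch (the
8-GPU machine and block counts are made up; CLASSICAL is assumed to be the
name of the non-multi-bit PBS_TYPE enumerator):

  // 96 radix blocks, multi-bit parameters: ceil(96 / 12) = 8 GPUs active.
  auto subset_mb = streams.active_gpu_subset(96, PBS_TYPE::MULTI_BIT);
  // 96 radix blocks, classical parameters: ceil(96 / 68) = 2 GPUs active.
  auto subset_cl = streams.active_gpu_subset(96, PBS_TYPE::CLASSICAL);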
@@ -20,7 +20,8 @@ template <typename Torus> struct boolean_bitop_buffer {
     gpu_memory_allocated = allocate_gpu_memory;
     this->op = op;
     this->params = params;
-    auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
+    auto active_streams =
+        streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
     this->unchecked = is_unchecked;
     switch (op) {
     case BITAND:

@@ -119,7 +120,8 @@ template <typename Torus> struct int_bitop_buffer {
     gpu_memory_allocated = allocate_gpu_memory;
     this->op = op;
     this->params = params;
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     switch (op) {
     case BITAND:
     case BITOR:

@@ -216,7 +218,8 @@ template <typename Torus> struct boolean_bitnot_buffer {
         message_extract_lut->get_max_degree(0), params.glwe_dimension,
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         lut_f_message_extract, gpu_memory_allocated);
-    auto active_streams = streams.active_gpu_subset(lwe_ciphertext_count);
+    auto active_streams =
+        streams.active_gpu_subset(lwe_ciphertext_count, params.pbs_type);
     message_extract_lut->broadcast_lut(active_streams);
   }
 }

@@ -39,7 +39,8 @@ template <typename Torus> struct int_extend_radix_with_sign_msb_buffer {
         },
         allocate_gpu_memory);
 
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     lut->broadcast_lut(active_streams);
 
     this->last_block = new CudaRadixCiphertextFFI;

@@ -14,7 +14,8 @@ template <typename Torus> struct int_zero_out_if_buffer {
                         uint64_t &size_tracker) {
     gpu_memory_allocated = allocate_gpu_memory;
     this->params = params;
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
 
     tmp = new CudaRadixCiphertextFFI;
     create_zero_radix_ciphertext_async<Torus>(

@@ -114,9 +115,11 @@ template <typename Torus> struct int_cmux_buffer {
         predicate_lut->get_lut_indexes(0, 0), h_lut_indexes,
         2 * num_radix_blocks * sizeof(Torus), streams.stream(0),
         streams.gpu_index(0), allocate_gpu_memory);
-    auto active_streams_pred = streams.active_gpu_subset(2 * num_radix_blocks);
+    auto active_streams_pred =
+        streams.active_gpu_subset(2 * num_radix_blocks, params.pbs_type);
     predicate_lut->broadcast_lut(active_streams_pred);
-    auto active_streams_msg = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams_msg =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     message_extract_lut->broadcast_lut(active_streams_msg);
   }

@@ -52,7 +52,8 @@ template <typename Torus> struct int_are_all_block_true_buffer {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, is_max_value_f, gpu_memory_allocated);
 
-    auto active_streams = streams.active_gpu_subset(max_chunks);
+    auto active_streams =
+        streams.active_gpu_subset(max_chunks, params.pbs_type);
     is_max_value->broadcast_lut(active_streams);
   }

@@ -108,7 +109,8 @@ template <typename Torus> struct int_comparison_eq_buffer {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
 
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     is_non_zero_lut->broadcast_lut(active_streams);
 
     // Scalar may have up to num_radix_blocks blocks

@@ -238,7 +240,8 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
         tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension,
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         block_selector_f, gpu_memory_allocated);
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     tree_inner_leaf_lut->broadcast_lut(active_streams);
   }

@@ -390,7 +393,8 @@ template <typename Torus> struct int_comparison_buffer {
     this->op = op;
     this->is_signed = is_signed;
 
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
 
     identity_lut_f = [](Torus x) -> Torus { return x; };

@@ -523,7 +527,7 @@ template <typename Torus> struct int_comparison_buffer {
         signed_lut->get_degree(0), signed_lut->get_max_degree(0),
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, signed_lut_f, gpu_memory_allocated);
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
     signed_lut->broadcast_lut(active_streams);
   }
   preallocated_h_lut = (Torus *)malloc(

@@ -116,7 +116,8 @@ template <typename Torus> struct int_decompression {
         effective_compression_carry_modulus,
         encryption_params.message_modulus, encryption_params.carry_modulus,
         decompression_rescale_f, gpu_memory_allocated);
-    auto active_streams = streams.active_gpu_subset(num_blocks_to_decompress);
+    auto active_streams = streams.active_gpu_subset(
+        num_blocks_to_decompress, decompression_rescale_lut->params.pbs_type);
     decompression_rescale_lut->broadcast_lut(active_streams);
   }
 }

@@ -356,7 +356,8 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
          luts[j]->get_degree(0), luts[j]->get_max_degree(0),
          params.glwe_dimension, params.polynomial_size, params.message_modulus,
          params.carry_modulus, lut_f_message_extract, gpu_memory_allocated);
-      auto active_streams = streams.active_gpu_subset(num_blocks);
+      auto active_streams =
+          streams.active_gpu_subset(num_blocks, params.pbs_type);
       luts[j]->broadcast_lut(active_streams);
     }
   }

@@ -1012,7 +1013,7 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
         masking_luts_1[i]->get_max_degree(0), params.glwe_dimension,
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         lut_f_masking, gpu_memory_allocated);
-    auto active_streams_1 = streams.active_gpu_subset(1);
+    auto active_streams_1 = streams.active_gpu_subset(1, params.pbs_type);
     masking_luts_1[i]->broadcast_lut(active_streams_1);
 
     generate_device_accumulator<Torus>(

@@ -1021,7 +1022,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
         masking_luts_2[i]->get_max_degree(0), params.glwe_dimension,
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         lut_f_masking, gpu_memory_allocated);
-    auto active_streams_2 = streams.active_gpu_subset(num_blocks);
+    auto active_streams_2 =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
     masking_luts_2[i]->broadcast_lut(active_streams_2);
   }

@@ -1040,7 +1042,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
 
     int_radix_lut<Torus> *luts[2] = {message_extract_lut_1,
                                      message_extract_lut_2};
-    auto active_streams = streams.active_gpu_subset(num_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
     for (int j = 0; j < 2; j++) {
       generate_device_accumulator<Torus>(
           streams.stream(0), streams.gpu_index(0), luts[j]->get_lut(0, 0),

@@ -1128,7 +1131,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
 
     // merge_overflow_flags_luts
     merge_overflow_flags_luts = new int_radix_lut<Torus> *[num_bits_in_message];
-    auto active_gpu_count_for_bits = streams.active_gpu_subset(1);
+    auto active_gpu_count_for_bits =
+        streams.active_gpu_subset(1, params.pbs_type);
     for (int i = 0; i < num_bits_in_message; i++) {
       auto lut_f_bit = [i](Torus x, Torus y) -> Torus {
         return (x == 0 && y == 0) << i;

@@ -1152,7 +1156,8 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
                                uint32_t num_blocks, bool allocate_gpu_memory,
                                uint64_t &size_tracker) {
     gpu_memory_allocated = allocate_gpu_memory;
-    auto active_streams = streams.active_gpu_subset(2 * num_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(2 * num_blocks, params.pbs_type);
     this->params = params;
 
     if (params.message_modulus == 4 && params.carry_modulus == 4 &&

@@ -1473,7 +1478,8 @@ template <typename Torus> struct int_div_rem_memory {
                      bool allocate_gpu_memory, uint64_t &size_tracker) {
 
     gpu_memory_allocated = allocate_gpu_memory;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
     this->params = params;
     this->is_signed = is_signed;

@@ -1559,7 +1565,7 @@ template <typename Torus> struct int_div_rem_memory {
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         f_compare_extracted_signed_bits, gpu_memory_allocated);
     auto active_gpu_count_cmp =
-        streams.active_gpu_subset(1); // only 1 block needed
+        streams.active_gpu_subset(1, params.pbs_type); // only 1 block needed
     compare_signed_bits_lut->broadcast_lut(active_gpu_count_cmp);
   }
 }

@@ -20,7 +20,8 @@ template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
     this->allocate_gpu_memory = allocate_gpu_memory;
     this->direction = direction;
     this->bit_value = bit_value;
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     this->univ_lut_mem =
         new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
                                  allocate_gpu_memory, size_tracker);

@@ -246,7 +247,8 @@ template <typename Torus> struct int_ilog2_buffer {
         params.glwe_dimension, params.polynomial_size,
         params.message_modulus, params.carry_modulus,
         lut_message_lambda, allocate_gpu_memory);
-    auto active_streams = streams.active_gpu_subset(counter_num_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(counter_num_blocks, params.pbs_type);
     lut_message_not->broadcast_lut(active_streams);
 
     this->lut_carry_not =

@@ -371,7 +371,8 @@ struct int_radix_lut_custom_input_output {
     this->num_input_blocks = num_input_blocks;
     this->gpu_memory_allocated = allocate_gpu_memory;
 
-    this->active_streams = streams.active_gpu_subset(num_radix_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
   }
 
   void setup_degrees() {

@@ -382,14 +383,18 @@ struct int_radix_lut_custom_input_output {
 
   void allocate_pbs_buffers(int_radix_params params, uint32_t num_radix_blocks,
                             bool allocate_gpu_memory, uint64_t &size_tracker) {
 
+    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
+                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+
     for (uint i = 0; i < active_streams.count(); i++) {
       cuda_set_device(active_streams.gpu_index(i));
       int8_t *gpu_pbs_buffer;
-      auto num_blocks_on_gpu =
-          std::min((int)num_radix_blocks,
-                   std::max(THRESHOLD_MULTI_GPU,
-                            get_num_inputs_on_gpu(num_radix_blocks, i,
-                                                  active_streams.count())));
+      auto num_blocks_on_gpu = std::min(
+          (int)num_radix_blocks,
+          std::max(threshold, get_num_inputs_on_gpu(num_radix_blocks, i,
+                                                    active_streams.count())));
 
       uint64_t size = 0;
       execute_scratch_pbs<OutputTorus>(
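The clamp above sizes each per-GPU scratch buffer for at least threshold
blocks, but never for more than the whole input. A worked sketch with the
constants from this commit (block counts are made up): with multi-bit
parameters (threshold 12), 32 radix blocks on 3 active GPUs with shares of
11/11/10 give min(32, max(12, 11)) = 12 blocks per buffer; with classical
parameters (threshold 68) the subset collapses to a single GPU and the
buffer covers min(32, max(68, 32)) = 32 blocks, i.e. the full input.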
@@ -424,18 +429,22 @@ struct int_radix_lut_custom_input_output {
   /// back to the original indexing
   multi_gpu_alloc_lwe_async(active_streams, lwe_array_in_vec,
                             num_radix_blocks, params.big_lwe_dimension + 1,
-                            size_tracker, allocate_gpu_memory);
+                            size_tracker, params.pbs_type,
+                            allocate_gpu_memory);
   multi_gpu_alloc_lwe_async(active_streams, lwe_after_ks_vec,
                             num_radix_blocks, params.small_lwe_dimension + 1,
-                            size_tracker, allocate_gpu_memory);
+                            size_tracker, params.pbs_type,
+                            allocate_gpu_memory);
   if (num_many_lut > 1) {
     multi_gpu_alloc_lwe_many_lut_output_async(
         active_streams, lwe_after_pbs_vec, num_radix_blocks, num_many_lut,
-        params.big_lwe_dimension + 1, size_tracker, allocate_gpu_memory);
+        params.big_lwe_dimension + 1, size_tracker, params.pbs_type,
+        allocate_gpu_memory);
   } else {
     multi_gpu_alloc_lwe_async(active_streams, lwe_after_pbs_vec,
                               num_radix_blocks, params.big_lwe_dimension + 1,
-                              size_tracker, allocate_gpu_memory);
+                              size_tracker, params.pbs_type,
+                              allocate_gpu_memory);
   }
   multi_gpu_alloc_array_async(active_streams, lwe_trivial_indexes_vec,
                               num_radix_blocks, size_tracker,

@@ -451,12 +460,14 @@ struct int_radix_lut_custom_input_output {
   }
 
   void setup_gemm_batch_ks_temp_buffers(uint64_t &size_tracker) {
+    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
+                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+
-    auto inputs_on_gpu =
-        std::min((int)num_input_blocks,
-                 std::max(THRESHOLD_MULTI_GPU,
-                          get_num_inputs_on_gpu(num_input_blocks, 0,
-                                                active_streams.count())));
+    auto inputs_on_gpu = std::min(
+        (int)num_input_blocks,
+        std::max(threshold, get_num_inputs_on_gpu(num_input_blocks, 0,
+                                                  active_streams.count())));
 
     if (inputs_on_gpu >= get_threshold_ks_gemm()) {
       for (auto i = 0; i < active_streams.count(); ++i) {

@@ -798,16 +809,20 @@ struct int_radix_lut_custom_input_output {
   void allocate_lwe_vector_for_non_trivial_indexes(
       CudaStreams streams, uint64_t max_num_radix_blocks,
       uint64_t &size_tracker, bool allocate_gpu_memory) {
 
+    int threshold = (params.pbs_type == PBS_TYPE::MULTI_BIT)
+                        ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+                        : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+
     // We need to create the auxiliary array only in GPU 0
     if (active_streams.count() > 1) {
       lwe_aligned_vec.resize(active_streams.count());
       for (uint i = 0; i < active_streams.count(); i++) {
         uint64_t size_tracker_on_array_i = 0;
-        auto inputs_on_gpu =
-            std::min((int)max_num_radix_blocks,
-                     std::max(THRESHOLD_MULTI_GPU,
-                              get_num_inputs_on_gpu(max_num_radix_blocks, i,
-                                                    active_streams.count())));
+        auto inputs_on_gpu = std::min(
+            (int)max_num_radix_blocks,
+            std::max(threshold, get_num_inputs_on_gpu(max_num_radix_blocks, i,
+                                                      active_streams.count())));
         InputTorus *d_array =
             (InputTorus *)cuda_malloc_with_size_tracking_async(
                 inputs_on_gpu * (params.big_lwe_dimension + 1) *

@@ -998,8 +1013,8 @@ template <typename Torus> struct int_bit_extract_luts_buffer {
         num_radix_blocks * bits_per_block * sizeof(Torus), streams.stream(0),
         streams.gpu_index(0), allocate_gpu_memory);
 
-    auto active_streams =
-        streams.active_gpu_subset(bits_per_block * num_radix_blocks);
+    auto active_streams = streams.active_gpu_subset(
+        bits_per_block * num_radix_blocks, params.pbs_type);
     lut->broadcast_lut(active_streams);
 
     /**

@@ -1266,7 +1281,8 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
        luts_message_carry->get_max_degree(1), params.glwe_dimension,
        params.polynomial_size, message_modulus, params.carry_modulus,
        lut_f_carry, gpu_memory_allocated);
-    auto active_gpu_count_mc = streams.active_gpu_subset(pbs_count);
+    auto active_gpu_count_mc =
+        streams.active_gpu_subset(pbs_count, params.pbs_type);
     luts_message_carry->broadcast_lut(active_gpu_count_mc);
   }
 }

@@ -1436,7 +1452,8 @@ template <typename Torus> struct int_seq_group_prop_memory {
     cuda_memcpy_with_size_tracking_async_to_gpu(
         seq_lut_indexes, h_seq_lut_indexes, num_seq_luts * sizeof(Torus),
         streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
-    auto active_streams = streams.active_gpu_subset(num_seq_luts);
+    auto active_streams =
+        streams.active_gpu_subset(num_seq_luts, params.pbs_type);
     lut_sequential_algorithm->broadcast_lut(active_streams);
     free(h_seq_lut_indexes);
   };

@@ -1490,7 +1507,8 @@ template <typename Torus> struct int_hs_group_prop_memory {
        lut_hillis_steele->get_max_degree(0), glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, f_lut_hillis_steele,
        gpu_memory_allocated);
-    auto active_streams = streams.active_gpu_subset(num_groups);
+    auto active_streams =
+        streams.active_gpu_subset(num_groups, params.pbs_type);
     lut_hillis_steele->broadcast_lut(active_streams);
   };
   void release(CudaStreams streams) {

@@ -1667,7 +1685,8 @@ template <typename Torus> struct int_shifted_blocks_and_states_memory {
        lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
     // Do I need to do something else for the multi-gpu?
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     luts_array_first_step->broadcast_lut(active_streams);
   };
   void release(CudaStreams streams) {

@@ -1932,7 +1951,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
        scalar_array_cum_sum, h_scalar_array_cum_sum,
        num_radix_blocks * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     luts_array_second_step->broadcast_lut(active_streams);
 
     if (use_sequential_algorithm_to_resolve_group_carries) {

@@ -1957,7 +1977,8 @@ template <typename Torus> struct int_prop_simu_group_carries_memory {
     cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
         lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
         streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
-    auto new_active_streams = streams.active_gpu_subset(new_num_blocks);
+    auto new_active_streams = streams.active_gpu_subset(
+        new_num_blocks, luts_array_second_step->params.pbs_type);
     // We just need to update the lut indexes so we use false here
     luts_array_second_step->broadcast_lut(new_active_streams, false);

@@ -2124,7 +2145,7 @@ template <typename Torus> struct int_sc_prop_memory {
        polynomial_size, message_modulus, carry_modulus, f_overflow_fp,
        gpu_memory_allocated);
 
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
     lut_overflow_flag_prep->broadcast_lut(active_streams);
   }

@@ -2196,7 +2217,8 @@ template <typename Torus> struct int_sc_prop_memory {
          (num_radix_blocks + 1) * sizeof(Torus), streams.stream(0),
          streams.gpu_index(0), allocate_gpu_memory);
     }
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks + 1);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks + 1, params.pbs_type);
     lut_message_extract->broadcast_lut(active_streams);
   };

@@ -2393,7 +2415,8 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
        lut_indexes, h_lut_indexes, lut_indexes_size, streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
     // Do I need to do something else for the multi-gpu?
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     luts_array_first_step->broadcast_lut(active_streams);
   };

@@ -2404,7 +2427,8 @@ template <typename Torus> struct int_shifted_blocks_and_borrow_states_memory {
     cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
         lut_indexes, new_lut_indexes, new_num_blocks * sizeof(Torus),
         streams.stream(0), streams.gpu_index(0), gpu_memory_allocated);
-    auto new_active_streams = streams.active_gpu_subset(new_num_blocks);
+    auto new_active_streams = streams.active_gpu_subset(
+        new_num_blocks, luts_array_first_step->params.pbs_type);
     // We just need to update the lut indexes so we use false here
     luts_array_first_step->broadcast_lut(new_active_streams, false);
   }

@@ -2499,7 +2523,8 @@ template <typename Torus> struct int_borrow_prop_memory {
        lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size,
        message_modulus, carry_modulus, f_message_extract,
        gpu_memory_allocated);
-    active_streams = streams.active_gpu_subset(num_radix_blocks);
+    active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
 
     lut_message_extract->broadcast_lut(active_streams);

@@ -2520,7 +2545,8 @@ template <typename Torus> struct int_borrow_prop_memory {
       lut_borrow_flag->broadcast_lut(active_streams);
     }
 
-    active_streams = streams.active_gpu_subset(num_radix_blocks);
+    active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     internal_streams.create_internal_cuda_streams_on_same_gpus(active_streams,
                                                                2);
   };

@@ -45,7 +45,8 @@ template <typename Torus> struct int_mul_memory {
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        zero_out_predicate_lut_f, gpu_memory_allocated);
 
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     zero_out_predicate_lut->broadcast_lut(active_streams);
 
     zero_out_mem = new int_zero_out_if_buffer<Torus>(

@@ -122,7 +123,8 @@ template <typename Torus> struct int_mul_memory {
        streams.stream(0), streams.gpu_index(0),
        luts_array->get_lut_indexes(0, lsb_vector_block_count), 1,
        msb_vector_block_count);
-    auto active_streams = streams.active_gpu_subset(total_block_count);
+    auto active_streams =
+        streams.active_gpu_subset(total_block_count, params.pbs_type);
     luts_array->broadcast_lut(active_streams);
     // create memory object for sum ciphertexts
     sum_ciphertexts_mem = new int_sum_ciphertexts_vec_memory<Torus>(

@@ -126,7 +126,8 @@ template <typename Torus> struct int_grouped_oprf_memory {
        luts->get_lut_indexes(0, 0), this->h_lut_indexes,
        num_blocks_to_process * sizeof(Torus), streams.stream(0),
        streams.gpu_index(0), allocate_gpu_memory);
-    auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
+    auto active_streams =
+        streams.active_gpu_subset(num_blocks_to_process, params.pbs_type);
     luts->broadcast_lut(active_streams);
 
     cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));

@@ -91,7 +91,8 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
        cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        shift_lut_f, gpu_memory_allocated);
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     cur_lut_bivariate->broadcast_lut(active_streams);
 
     lut_buffers_bivariate.push_back(cur_lut_bivariate);

@@ -177,7 +178,8 @@ template <typename Torus> struct int_logical_scalar_shift_buffer {
        cur_lut_bivariate->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        shift_lut_f, gpu_memory_allocated);
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     cur_lut_bivariate->broadcast_lut(active_streams);
 
     lut_buffers_bivariate.push_back(cur_lut_bivariate);

@@ -220,7 +222,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
                                    uint64_t &size_tracker) {
     gpu_memory_allocated = allocate_gpu_memory;
 
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
     // In the arithmetic shift, a PBS has to be applied to the last rotated
     // block twice: once to shift it, once to compute the padding block to be
     // copied onto all blocks to the left of the last rotated block

@@ -276,7 +278,8 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        shift_last_block_lut_univariate->get_max_degree(0),
        params.glwe_dimension, params.polynomial_size, params.message_modulus,
        params.carry_modulus, last_block_lut_f, gpu_memory_allocated);
-    auto active_streams_shift_last = streams.active_gpu_subset(1);
+    auto active_streams_shift_last =
+        streams.active_gpu_subset(1, params.pbs_type);
     shift_last_block_lut_univariate->broadcast_lut(active_streams_shift_last);
 
     lut_buffers_univariate.push_back(shift_last_block_lut_univariate);

@@ -302,7 +305,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        padding_block_lut_univariate->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        padding_block_lut_f, gpu_memory_allocated);
-    // auto active_streams = streams.active_gpu_subset(1);
+    // auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
     padding_block_lut_univariate->broadcast_lut(active_streams);
 
     lut_buffers_univariate.push_back(padding_block_lut_univariate);

@@ -344,7 +347,7 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        blocks_lut_f, gpu_memory_allocated);
     auto active_streams_shift_blocks =
-        streams.active_gpu_subset(num_radix_blocks);
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     shift_blocks_lut_bivariate->broadcast_lut(active_streams_shift_blocks);
 
     lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate);

@@ -119,8 +119,8 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
        mux_lut->get_degree(0), mux_lut->get_max_degree(0),
        params.glwe_dimension, params.polynomial_size, params.message_modulus,
        params.carry_modulus, mux_lut_f, gpu_memory_allocated);
-    auto active_gpu_count_mux =
-        streams.active_gpu_subset(bits_per_block * num_radix_blocks);
+    auto active_gpu_count_mux = streams.active_gpu_subset(
+        bits_per_block * num_radix_blocks, params.pbs_type);
     mux_lut->broadcast_lut(active_gpu_count_mux);
 
     auto cleaning_lut_f = [params](Torus x) -> Torus {

@@ -132,7 +132,7 @@ template <typename Torus> struct int_shift_and_rotate_buffer {
        params.glwe_dimension, params.polynomial_size, params.message_modulus,
        params.carry_modulus, cleaning_lut_f, gpu_memory_allocated);
     auto active_gpu_count_cleaning =
-        streams.active_gpu_subset(num_radix_blocks);
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     cleaning_lut->broadcast_lut(active_gpu_count_cleaning);
   }

@@ -108,7 +108,8 @@ template <typename Torus> struct int_overflowing_sub_memory {
        glwe_dimension, polynomial_size, message_modulus, carry_modulus,
        f_message_acc, gpu_memory_allocated);
 
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+    auto active_streams =
+        streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
     luts_array->broadcast_lut(active_streams);
     luts_borrow_propagation_sum->broadcast_lut(active_streams);
     message_acc->broadcast_lut(active_streams);

@@ -38,7 +38,8 @@ template <typename Torus> struct int_unchecked_all_eq_slices_buffer {
       num_streams_to_use = 1;
 
     this->num_streams = num_streams_to_use;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
     uint32_t num_gpus = active_streams.count();

@@ -40,7 +40,8 @@ template <typename Torus> struct int_equality_selectors_buffer {
 
     this->num_streams = num_streams_to_use;
 
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
     this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
         active_streams, num_streams_to_use);

@@ -154,7 +155,8 @@ template <typename Torus> struct int_possible_results_buffer {
 
     this->num_streams = num_streams_to_use;
 
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
     this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
         active_streams, num_streams_to_use);

@@ -207,7 +209,8 @@ template <typename Torus> struct int_possible_results_buffer {
          params.message_modulus, params.carry_modulus, fns,
          allocate_gpu_memory);
 
-      current_lut->broadcast_lut(streams.active_gpu_subset(1));
+      current_lut->broadcast_lut(
+          streams.active_gpu_subset(1, params.pbs_type));
       stream_luts[lut_count++] = current_lut;
       lut_value_start += luts_in_this_call;
     }

@@ -282,7 +285,8 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
 
    this->num_streams = num_streams_to_use;
 
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
     this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
         active_streams, num_streams);

@@ -300,7 +304,8 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        id_fn, allocate_gpu_memory);
 
-    lut->broadcast_lut(streams.active_gpu_subset(num_blocks));
+    lut->broadcast_lut(
+        streams.active_gpu_subset(num_blocks, params.pbs_type));
     this->stream_identity_luts[i] = lut;
   }

@@ -321,7 +326,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        msg_fn, allocate_gpu_memory);
     this->message_extract_lut->broadcast_lut(
-        streams.active_gpu_subset(num_blocks));
+        streams.active_gpu_subset(num_blocks, params.pbs_type));
 
     this->carry_extract_lut = new int_radix_lut<Torus>(
         streams, params, 1, num_blocks, allocate_gpu_memory, size_tracker);

@@ -333,7 +338,7 @@ template <typename Torus> struct int_aggregate_one_hot_buffer {
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        carry_fn, allocate_gpu_memory);
     this->carry_extract_lut->broadcast_lut(
-        streams.active_gpu_subset(num_blocks));
+        streams.active_gpu_subset(num_blocks, params.pbs_type));
 
     this->partial_aggregated_vectors =
         new CudaRadixCiphertextFFI *[num_streams];

@@ -628,7 +633,8 @@ template <typename Torus> struct int_unchecked_contains_buffer {
      num_streams_to_use = 1;
 
    this->num_streams = num_streams_to_use;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);

@@ -703,7 +709,8 @@ template <typename Torus> struct int_unchecked_contains_clear_buffer {
      num_streams_to_use = 1;
 
    this->num_streams = num_streams_to_use;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);

@@ -1094,7 +1101,8 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
      num_streams_to_use = 1;
 
    this->num_streams = num_streams_to_use;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);

@@ -1184,7 +1192,8 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
        this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        prefix_sum_fn, allocate_gpu_memory);
-    this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
+    this->prefix_sum_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));
 
    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;

@@ -1200,7 +1209,8 @@ template <typename Torus> struct int_unchecked_first_index_of_clear_buffer {
        this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        cleanup_fn, allocate_gpu_memory);
-    this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
+    this->cleanup_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));
  }
 
  void release(CudaStreams streams) {

@@ -1292,7 +1302,8 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
      num_streams_to_use = 1;
 
    this->num_streams = num_streams_to_use;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);

@@ -1372,7 +1383,8 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
        this->prefix_sum_lut->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        prefix_sum_fn, allocate_gpu_memory);
-    this->prefix_sum_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
+    this->prefix_sum_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));
 
    auto cleanup_fn = [ALREADY_SEEN, params](Torus x) -> Torus {
      Torus val = x % params.message_modulus;

@@ -1388,7 +1400,8 @@ template <typename Torus> struct int_unchecked_first_index_of_buffer {
        this->cleanup_lut->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        cleanup_fn, allocate_gpu_memory);
-    this->cleanup_lut->broadcast_lut(streams.active_gpu_subset(num_inputs));
+    this->cleanup_lut->broadcast_lut(
+        streams.active_gpu_subset(num_inputs, params.pbs_type));
  }
 
  void release(CudaStreams streams) {

@@ -1462,7 +1475,8 @@ template <typename Torus> struct int_unchecked_index_of_buffer {
      num_streams_to_use = 1;
 
    this->num_streams = num_streams_to_use;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);

@@ -1523,7 +1537,8 @@ template <typename Torus> struct int_unchecked_index_of_clear_buffer {
      num_streams_to_use = 1;
 
    this->num_streams = num_streams_to_use;
-    this->active_streams = streams.active_gpu_subset(num_blocks);
+    this->active_streams =
+        streams.active_gpu_subset(num_blocks, params.pbs_type);
 
    this->internal_cuda_streams.create_internal_cuda_streams_on_same_gpus(
        active_streams, num_streams_to_use);

@@ -289,7 +289,8 @@ template <typename Torus> struct zk_expand_mem {
        lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
        streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
 
-    auto active_streams = streams.active_gpu_subset(2 * num_lwes);
+    auto active_streams =
+        streams.active_gpu_subset(2 * num_lwes, params.pbs_type);
     message_and_carry_extract_luts->broadcast_lut(active_streams);
 
     message_and_carry_extract_luts->allocate_lwe_vector_for_non_trivial_indexes(

@@ -153,7 +153,8 @@ __host__ void are_all_comparisons_block_true(
    cuda_memcpy_async_to_gpu(is_max_value_lut->get_lut_indexes(0, 0),
                             h_lut_indexes, num_chunks * sizeof(Torus),
                             streams.stream(0), streams.gpu_index(0));
-    auto active_streams = streams.active_gpu_subset(num_chunks);
+    auto active_streams =
+        streams.active_gpu_subset(num_chunks, params.pbs_type);
    is_max_value_lut->broadcast_lut(active_streams);
  }
  lut = is_max_value_lut;

@@ -172,8 +173,8 @@ __host__ void are_all_comparisons_block_true(
                             is_max_value_lut->h_lut_indexes,
                             is_max_value_lut->num_blocks * sizeof(Torus),
                             streams.stream(0), streams.gpu_index(0));
-    auto active_gpu_count_is_max =
-        streams.active_gpu_subset(is_max_value_lut->num_blocks);
+    auto active_gpu_count_is_max = streams.active_gpu_subset(
+        is_max_value_lut->num_blocks, params.pbs_type);
    is_max_value_lut->broadcast_lut(active_gpu_count_is_max, false);
 
    reset_radix_ciphertext_blocks(lwe_array_out, 1);

@@ -488,7 +489,7 @@ tree_sign_reduction(CudaStreams streams, CudaRadixCiphertextFFI *lwe_array_out,
      polynomial_size, message_modulus, carry_modulus, f, true,
      tree_buffer->preallocated_h_lut);
 
-  auto active_streams = streams.active_gpu_subset(1);
+  auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
  last_lut->broadcast_lut(active_streams);
 
  // Last leaf

@@ -339,7 +339,9 @@ host_integer_decompress(CudaStreams streams,
  /// dimension to a big LWE dimension
  auto encryption_params = h_mem_ptr->encryption_params;
  auto lut = h_mem_ptr->decompression_rescale_lut;
-  auto active_streams = streams.active_gpu_subset(num_blocks_to_decompress);
+  auto active_streams = streams.active_gpu_subset(
+      num_blocks_to_decompress,
+      h_mem_ptr->decompression_rescale_lut->params.pbs_type);
  if (active_streams.count() == 1) {
    execute_pbs_async<Torus, Torus>(
        active_streams, (Torus *)d_lwe_array_out->ptr, lut->lwe_indexes_out,

@@ -542,7 +542,8 @@ __host__ void integer_radix_apply_univariate_lookup_table(
  std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
 
-  auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+  auto active_streams =
+      streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<Torus>(
        streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],
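The subset size computed here selects between two execution paths: with a
single active GPU, the keyswitch and PBS run directly on stream 0; with
more, the inputs are scattered across the subset and gathered afterwards.
A toy model of that decision, using the thresholds from this commit (the
function and its bool parameter are hypothetical):

  #include <algorithm>
  #include <cstdint>

  enum class Path { SingleGpu, MultiGpu };

  // Mirrors get_active_gpu_count: one GPU absorbs up to `threshold` inputs.
  Path lut_dispatch_path(uint32_t num_radix_blocks, uint32_t gpu_count,
                         bool multi_bit_pbs) {
    uint32_t threshold = multi_bit_pbs ? 12 : 68;
    uint32_t active = std::min(
        std::max<uint32_t>(1, (num_radix_blocks + threshold - 1) / threshold),
        gpu_count);
    return active == 1 ? Path::SingleGpu : Path::MultiGpu;
  }

Under classical parameters, radix ciphertexts of up to 68 blocks therefore
stay on the single-GPU fast path that previously ended at 12 blocks.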
@@ -645,7 +646,8 @@ __host__ void integer_radix_apply_many_univariate_lookup_table(
  std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
 
-  auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+  auto active_streams =
+      streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<Torus>(
        streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],

@@ -764,7 +766,8 @@ __host__ void integer_radix_apply_bivariate_lookup_table(
  std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
 
-  auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+  auto active_streams =
+      streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<Torus>(
        streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],

@@ -1812,7 +1815,8 @@ uint64_t scratch_cuda_apply_univariate_lut(
      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
      streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
  *(*mem_ptr)->get_degree(0) = lut_degree;
-  auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+  auto active_streams =
+      streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
  (*mem_ptr)->broadcast_lut(active_streams);
  POP_RANGE()
  return size_tracker;

@@ -1847,7 +1851,8 @@ uint64_t scratch_cuda_apply_many_univariate_lut(
      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
      streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
  *(*mem_ptr)->get_degree(0) = lut_degree;
-  auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+  auto active_streams =
+      streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
  (*mem_ptr)->broadcast_lut(active_streams);
  POP_RANGE()
  return size_tracker;

@@ -1883,7 +1888,8 @@ uint64_t scratch_cuda_apply_bivariate_lut(
      (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
      streams.stream(0), streams.gpu_index(0), allocate_gpu_memory);
  *(*mem_ptr)->get_degree(0) = lut_degree;
-  auto active_streams = streams.active_gpu_subset(num_radix_blocks);
+  auto active_streams =
+      streams.active_gpu_subset(num_radix_blocks, params.pbs_type);
  (*mem_ptr)->broadcast_lut(active_streams);
  POP_RANGE()
  return size_tracker;

@@ -2336,8 +2342,8 @@ integer_radix_apply_noise_squashing(CudaStreams streams,
 
  // Since the radix ciphertexts are packed, we have to use the num_radix_blocks
  // from the output ct
-  auto active_streams =
-      streams.active_gpu_subset(lwe_array_out->num_radix_blocks);
+  auto active_streams = streams.active_gpu_subset(
+      lwe_array_out->num_radix_blocks, params.pbs_type);
  if (active_streams.count() == 1) {
    execute_keyswitch_async<InputTorus>(
        streams.get_ith(0), lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0],

@@ -388,7 +388,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
    current_columns.next_accumulation(total_ciphertexts, total_messages,
                                      needs_processing);
 
-    auto active_streams = streams.active_gpu_subset(total_ciphertexts);
+    auto active_streams =
+        streams.active_gpu_subset(total_ciphertexts, mem_ptr->params.pbs_type);
    GPU_ASSERT(total_ciphertexts <= mem_ptr->luts_message_carry->num_blocks,
               "SUM CT");

@@ -442,7 +443,8 @@ __host__ void host_integer_partial_sum_ciphertexts_vec(
        streams.stream(0), streams.gpu_index(0), current_blocks,
        num_radix_blocks, num_radix_blocks + 1);
 
-    auto active_streams = streams.active_gpu_subset(2 * num_radix_blocks);
+    auto active_streams = streams.active_gpu_subset(2 * num_radix_blocks,
+                                                    mem_ptr->params.pbs_type);
 
    if (active_streams.count() == 1) {
      execute_keyswitch_async<Torus>(

@@ -29,7 +29,8 @@ void host_integer_grouped_oprf(CudaStreams streams,
                               int_grouped_oprf_memory<Torus> *mem_ptr,
                               void *const *bsks) {
 
-  auto active_streams = streams.active_gpu_subset(num_blocks_to_process);
+  auto active_streams = streams.active_gpu_subset(num_blocks_to_process,
+                                                  mem_ptr->params.pbs_type);
  auto lut = mem_ptr->luts;
 
  if (active_streams.count() == 1) {

@@ -45,7 +45,8 @@ host_scalar_bitop(CudaStreams streams, CudaRadixCiphertextFFI *output,
    cuda_memcpy_async_gpu_to_gpu(lut->get_lut_indexes(0, 0), clear_blocks,
                                 num_clear_blocks * sizeof(Torus),
                                 streams.stream(0), streams.gpu_index(0));
-    auto active_streams = streams.active_gpu_subset(num_clear_blocks);
+    auto active_streams = streams.active_gpu_subset(
+        num_clear_blocks, mem_ptr->lut->params.pbs_type);
    lut->broadcast_lut(active_streams, false);
 
    integer_radix_apply_univariate_lookup_table<Torus>(

@@ -146,7 +146,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
        lut->get_degree(0), lut->get_max_degree(0), glwe_dimension,
        polynomial_size, message_modulus, carry_modulus, scalar_last_leaf_lut_f,
        true, mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
    lut->broadcast_lut(active_streams);
 
    integer_radix_apply_univariate_lookup_table<Torus>(

@@ -240,7 +240,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
        polynomial_size, message_modulus, carry_modulus,
        scalar_bivariate_last_leaf_lut_f, true,
        mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
    lut->broadcast_lut(active_streams);
 
    integer_radix_apply_bivariate_lookup_table<Torus>(

@@ -274,7 +274,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check(
        params.glwe_dimension, params.polynomial_size, params.message_modulus,
        params.carry_modulus, one_block_lut_f, true,
        mem_ptr->preallocated_h_lut);
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
    one_block_lut->broadcast_lut(active_streams);
 
    integer_radix_apply_univariate_lookup_table<Torus>(

@@ -419,7 +419,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
        polynomial_size, message_modulus, carry_modulus,
        scalar_bivariate_last_leaf_lut_f, true,
        mem_ptr->diff_buffer->tree_buffer->preallocated_h_lut);
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
    lut->broadcast_lut(active_streams);
 
    integer_radix_apply_bivariate_lookup_table<Torus>(

@@ -521,7 +521,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
        signed_msb_lut->get_max_degree(0), params.glwe_dimension,
        params.polynomial_size, params.message_modulus, params.carry_modulus,
        lut_f, true, mem_ptr->preallocated_h_lut);
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
    signed_msb_lut->broadcast_lut(active_streams);
 
    CudaRadixCiphertextFFI sign_block;

@@ -567,7 +567,7 @@ __host__ void integer_radix_signed_scalar_difference_check(
        params.glwe_dimension, params.polynomial_size, params.message_modulus,
        params.carry_modulus, one_block_lut_f, true,
        mem_ptr->preallocated_h_lut);
-    auto active_streams = streams.active_gpu_subset(1);
+    auto active_streams = streams.active_gpu_subset(1, params.pbs_type);
    one_block_lut->broadcast_lut(active_streams);
 
    integer_radix_apply_univariate_lookup_table<Torus>(

@@ -785,8 +785,8 @@ __host__ void host_scalar_equality_check(
          num_halved_scalar_blocks * sizeof(Torus), lsb_streams.stream(0),
          lsb_streams.gpu_index(0));
    }
-    auto active_streams =
-        lsb_streams.active_gpu_subset(num_halved_scalar_blocks);
+    auto active_streams = lsb_streams.active_gpu_subset(
+        num_halved_scalar_blocks, params.pbs_type);
    // We use false cause we only will broadcast the indexes
    scalar_comparison_luts->broadcast_lut(active_streams, false);

@@ -5,7 +5,8 @@
 
 std::mutex m;
 bool p2p_enabled = false;
-const int THRESHOLD_MULTI_GPU = 12;
+const int THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS = 12;
+const int THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS = 68;
 
 // Enable bidirectional p2p access between all available GPUs and device_0_id
 int32_t cuda_setup_multi_gpu(int device_0_id) {

@@ -39,10 +40,13 @@ int32_t cuda_setup_multi_gpu(int device_0_id) {
  return (int32_t)(num_used_gpus);
 }
 
-uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count) {
+uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count,
+                              PBS_TYPE pbs_type) {
+  int threshold = (pbs_type == MULTI_BIT)
+                      ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+                      : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
  uint32_t ceil_div_inputs =
-      std::max((uint32_t)1,
-               (num_inputs + THRESHOLD_MULTI_GPU - 1) / THRESHOLD_MULTI_GPU);
+      std::max((uint32_t)1, (num_inputs + threshold - 1) / threshold);
  uint32_t active_gpu_count = std::min(ceil_div_inputs, gpu_count);
  return active_gpu_count;
 }
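Concretely, on an 8-GPU machine a 64-block input resolves to
ceil(64 / 12) = 6 active GPUs with multi-bit parameters but
ceil(64 / 68) = 1 active GPU with classical parameters, presumably because
splitting small classical-PBS batches across devices was not paying for the
transfer overhead. A hedged usage sketch (GPU and block counts are made up,
and CLASSICAL is assumed to be the non-multi-bit enumerator):

  uint32_t gpus = 8;
  uint32_t a = get_active_gpu_count(64, gpus, PBS_TYPE::CLASSICAL);  // 1
  uint32_t b = get_active_gpu_count(512, gpus, PBS_TYPE::CLASSICAL); // 8
  uint32_t c = get_active_gpu_count(64, gpus, PBS_TYPE::MULTI_BIT);  // 6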
@@ -59,15 +59,20 @@ template <typename Torus>
 void multi_gpu_alloc_lwe_async(CudaStreams streams, std::vector<Torus *> &dest,
                                uint32_t num_inputs, uint32_t lwe_size,
                                uint64_t &size_tracker_on_gpu_0,
-                               bool allocate_gpu_memory) {
+                               PBS_TYPE pbs_type, bool allocate_gpu_memory) {
  PANIC_IF_FALSE(dest.empty(),
                 "Cuda error: Requested multi-GPU vector is already allocated");
 
+  int threshold = (pbs_type == MULTI_BIT)
+                      ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+                      : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+
  dest.resize(streams.count());
  for (uint i = 0; i < streams.count(); i++) {
    uint64_t size_tracker_on_gpu_i = 0;
    auto inputs_on_gpu = std::min(
        (int)num_inputs,
-        std::max(THRESHOLD_MULTI_GPU,
+        std::max((int)threshold,
                 get_num_inputs_on_gpu(num_inputs, i, streams.count())));
    Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
        inputs_on_gpu * lwe_size * sizeof(Torus), streams.stream(i),

@@ -81,7 +86,7 @@ void multi_gpu_alloc_lwe_async(CudaStreams streams, std::vector<Torus *> &dest,
 
 template void multi_gpu_alloc_lwe_async<__uint128_t>(
    CudaStreams streams, std::vector<__uint128_t *> &dest, uint32_t num_inputs,
-    uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
+    uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0, PBS_TYPE pbs_type,
    bool allocate_gpu_memory);
 
 /// Allocates the input/output vector for all devices

@@ -91,16 +96,21 @@ template <typename Torus>
 void multi_gpu_alloc_lwe_many_lut_output_async(
    CudaStreams streams, std::vector<Torus *> &dest, uint32_t num_inputs,
    uint32_t num_many_lut, uint32_t lwe_size, uint64_t &size_tracker_on_gpu_0,
-    bool allocate_gpu_memory) {
+    PBS_TYPE pbs_type, bool allocate_gpu_memory) {
 
  PANIC_IF_FALSE(dest.empty(),
                 "Cuda error: Requested multi-GPU vector is already allocated");
 
+  int threshold = (pbs_type == MULTI_BIT)
+                      ? THRESHOLD_MULTI_GPU_WITH_MULTI_BIT_PARAMS
+                      : THRESHOLD_MULTI_GPU_WITH_CLASSICAL_PARAMS;
+
  dest.resize(streams.count());
  for (uint i = 0; i < streams.count(); i++) {
    uint64_t size_tracker = 0;
    auto inputs_on_gpu = std::min(
        (int)num_inputs,
-        std::max(THRESHOLD_MULTI_GPU,
+        std::max((int)threshold,
                 get_num_inputs_on_gpu(num_inputs, i, streams.count())));
    Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
        num_many_lut * inputs_on_gpu * lwe_size * sizeof(Torus),
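One consequence of clamping with the threshold rather than with the exact
per-GPU share is that every active GPU reserves room for at least threshold
LWEs even when its share is smaller. A worked sketch, assuming classical
parameters and a big-LWE size of 1025 words of 8 bytes (both numbers are
illustrative): 128 inputs resolve to ceil(128 / 68) = 2 active GPUs with 64
inputs apiece, yet each allocates min(128, max(68, 64)) = 68 LWEs, i.e.
68 * 1025 * 8 = 557,600 bytes.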